net/unix/af_unix.c
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko Eißfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect
 *					algorithm. Lots of bug fixes.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen
 *					socket has been reached. This won't
 *					break old apps and it will avoid huge
 *					amounts of socks hashed (this for
 *					unix_gc() performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are
 *					introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now
 *					(2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */
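
/*
 * For illustration only (a userspace sketch, not part of this file):
 * binding to an abstract address differs from a filesystem bind only
 * in the leading NUL byte and in passing the exact address length.
 * The name "example" below is made up.
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	memcpy(a.sun_path, "\0example", 8);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);
 */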
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/freezer.h>

struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
static atomic_long_t unix_nr_socks;

static struct hlist_head *unix_sockets_unbound(void *addr)
{
	unsigned long hash = (unsigned long)addr;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash %= UNIX_HASH_SIZE;
	return &unix_socket_table[UNIX_HASH_SIZE + hash];
}
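
/* The socket table has 2 * UNIX_HASH_SIZE buckets: the first half holds
 * bound sockets (hashed by name or inode number), the second half holds
 * unbound sockets, hashed above by their kernel address.
 */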
#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = *UNIXSID(skb);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * SMP locking strategy:
 *    hash table is protected with spinlock unix_table_lock
 *    each socket state is protected by separate spin lock.
 */

static inline unsigned int unix_hash_fold(__wsum n)
{
	unsigned int hash = (__force unsigned int)csum_fold(n);

	hash ^= hash >> 8;
	return hash & (UNIX_HASH_SIZE - 1);
}

#define unix_peer(sk) (unix_sk(sk)->peer)

static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}

static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}
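
/* A dgram receiver counts as "full" once its receive queue exceeds the
 * backlog set at listen() time (or the sysctl_max_dgram_qlen default from
 * unix_create1()); senders that are not the receiver's own peer must then
 * back off, see unix_dgram_sendmsg() and unix_dgram_poll().
 */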
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static inline void unix_release_addr(struct unix_address *addr)
{
	if (atomic_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it starts with a non-zero byte, it should be NUL
 *		  terminated (FS object)
 *		- if it starts with a zero byte, it is an abstract name.
 */

static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesn't as such exist. However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len] = 0;
		len = strlen(sunaddr->sun_path) + 1 + sizeof(short);
		return len;
	}

	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
	return len;
}
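
/* Note: unix_mkname() returns the significant address length. For
 * filesystem names the path is forcibly NUL terminated and *hashp is
 * left untouched; only abstract names (leading NUL byte) are hashed,
 * via unix_hash_fold() above.
 */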
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}

static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}

static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}

static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}

static struct sock *__unix_find_socket_byname(struct net *net,
					      struct sockaddr_un *sunname,
					      int len, int type, unsigned int hash)
{
	struct sock *s;

	sk_for_each(s, &unix_socket_table[hash ^ type]) {
		struct unix_sock *u = unix_sk(s);

		if (!net_eq(sock_net(s), net))
			continue;

		if (u->addr->len == len &&
		    !memcmp(u->addr->name, sunname, len))
			goto found;
	}
	s = NULL;
found:
	return s;
}

static inline struct sock *unix_find_socket_byname(struct net *net,
						   struct sockaddr_un *sunname,
						   int len, int type,
						   unsigned int hash)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	s = __unix_find_socket_byname(net, sunname, len, type, hash);
	if (s)
		sock_hold(s);
	spin_unlock(&unix_table_lock);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;

	spin_lock(&unix_table_lock);
	sk_for_each(s,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when
 * the association to the server socket is dissolved or after a wake
 * up was relayed.
 */
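
/* Roughly, for a client C connected to an unreplying server S (e.g. a
 * logger bound to /dev/log), the relay works like this:
 *
 *	C hits flow control	-> C->peer_wake queued on S->peer_wait
 *	S dequeues a datagram	-> wake_up on S->peer_wait
 *				-> unix_dgram_peer_wake_relay()
 *				-> wake_up on sk_sleep(C), C may write again
 */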
static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
				      void *key)
{
	struct unix_sock *u;
	wait_queue_head_t *u_sleep;

	u = container_of(q, struct unix_sock, peer_wake);

	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
			    q);
	u->peer_wake.private = NULL;

	/* relaying can only happen while the wq still exists */
	u_sleep = sk_sleep(&u->sk);
	if (u_sleep)
		wake_up_interruptible_poll(u_sleep, key);

	return 0;
}

static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
	struct unix_sock *u, *u_other;
	int rc;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	rc = 0;
	spin_lock(&u_other->peer_wait.lock);

	if (!u->peer_wake.private) {
		u->peer_wake.private = other;
		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);

		rc = 1;
	}

	spin_unlock(&u_other->peer_wait.lock);
	return rc;
}

static void unix_dgram_peer_wake_disconnect(struct sock *sk,
					    struct sock *other)
{
	struct unix_sock *u, *u_other;

	u = unix_sk(sk);
	u_other = unix_sk(other);
	spin_lock(&u_other->peer_wait.lock);

	if (u->peer_wake.private == other) {
		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
		u->peer_wake.private = NULL;
	}

	spin_unlock(&u_other->peer_wait.lock);
}

static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
						   struct sock *other)
{
	unix_dgram_peer_wake_disconnect(sk, other);
	wake_up_interruptible_poll(sk_sleep(sk),
				   POLLOUT |
				   POLLWRNORM |
				   POLLWRBAND);
}

/* preconditions:
 *	- unix_peer(sk) == other
 *	- association is stable
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
	int connected;

	connected = unix_dgram_peer_wake_connect(sk, other);

	if (unix_recvq_full(other))
		return 1;

	if (connected)
		unix_dgram_peer_wake_disconnect(sk, other);

	return 0;
}

static inline int unix_writable(struct sock *sk)
{
	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
}
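
/* That is, the socket counts as writable while no more than a quarter of
 * sk_sndbuf is committed to not-yet-consumed skbs (the shift by 2
 * multiplies the outstanding write memory by four).
 */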
static void unix_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	if (unix_writable(sk)) {
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait,
				POLLOUT | POLLWRNORM | POLLWRBAND);
		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}
	rcu_read_unlock();
}

/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer. First, this allows us
 * to do flow control based only on wmem_alloc; second, an sk connected to a
 * peer may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of a bidirectional dgram pipe is
		 * disconnected, we signal error. Messages are lost.
		 * Do not signal this when the peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
	WARN_ON(!sk_unhashed(sk));
	WARN_ON(sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_info("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_long_dec(&unix_nr_socks);
	local_bh_disable();
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG
	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
		atomic_long_read(&unix_nr_socks));
#endif
}

static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	path = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}

		unix_dgram_peer_wake_disconnect(sk, skpair);
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
static void init_peercred(struct sock *sk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(task_tgid(current));
	sk->sk_peer_cred = get_current_cred();
}

static void copy_peercred(struct sock *sk, struct sock *peersk)
{
	put_pid(sk->sk_peer_pid);
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
}

static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct pid *old_pid = NULL;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;	/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;	/* No listens on an unbound socket */
	unix_state_lock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	init_peercred(sk);
	err = 0;

out_unlock:
	unix_state_unlock(sk);
	put_pid(old_pid);
out:
	return err;
}
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t, int);

static int unix_set_peek_off(struct sock *sk, int val)
{
	struct unix_sock *u = unix_sk(sk);

	if (mutex_lock_interruptible(&u->readlock))
		return -EINTR;

	sk->sk_peek_off = val;
	mutex_unlock(&u->readlock);

	return 0;
}
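
/* Hypothetical userspace use of the hook above (illustrative only):
 *
 *	int off = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);	// peek from off, advance it
 */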
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};

static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * dont trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key:
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;

static struct sock *unix_create1(struct net *net, struct socket *sock)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	atomic_long_inc(&unix_nr_socks);
	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
		goto out;

	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
	if (!sk)
		goto out;

	sock_init_data(sock, sk);
	lockdep_set_class(&sk->sk_receive_queue.lock,
				&af_unix_sk_receive_queue_lock_key);

	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	atomic_long_set(&u->inflight, 0);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->readlock); /* single task reading lock */
	init_waitqueue_head(&u->peer_wait);
	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
	unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
	if (sk == NULL)
		atomic_long_dec(&unix_nr_socks);
	else {
		local_bh_disable();
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
		local_bh_enable();
	}
	return sk;
}

static int unix_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type = SOCK_DGRAM;
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(net, sock) ? 0 : -ENOMEM;
}

static int unix_release(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (!sk)
		return 0;

	unix_release_sock(sk, 0);
	sock->sk = NULL;

	return 0;
}
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		return err;

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take a long time if many
		 * names are already in use.
		 */
		cond_resched();
		/* Give up if all names seem to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}
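
/* Autobound names live in the abstract namespace and look like "\0XXXXX",
 * a NUL followed by five hex digits, so at most 0xFFFFF names can be
 * autobound per socket type before the -ENOSPC above triggers.
 */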
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *u;
	struct path path;
	int err = 0;

	if (sunname->sun_path[0]) {
		struct inode *inode;
		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
		if (err)
			goto fail;
		inode = d_backing_inode(path.dentry);
		err = inode_permission(inode, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(inode->i_mode))
			goto put_fail;
		u = unix_find_socket_byinode(inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(&path);

		path_put(&path);

		err = -EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u = unix_find_socket_byname(net, sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->path.dentry;
			if (dentry)
				touch_atime(&unix_sk(u)->path);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}

static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
{
	struct dentry *dentry;
	struct path path;
	int err = 0;
	/*
	 * Get the parent directory, calculate the hash for last
	 * component.
	 */
	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		return err;

	/*
	 * All right, let's create it.
	 */
	err = security_path_mknod(&path, dentry, mode, 0);
	if (!err) {
		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
		if (!err) {
			res->mnt = mntget(path.mnt);
			res->dentry = dget(dentry);
		}
	}
	done_path_create(&path, dentry);
	return err;
}
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	err = mutex_lock_interruptible(&u->readlock);
	if (err)
		goto out;

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		addr->hash = UNIX_HASH_SIZE;
		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}

static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}

static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk) = other;
		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		unix_recvq_full(other);

	unix_state_unlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}

static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   It is a tricky place. We need to grab our state lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
	struct sock *ska = socka->sk, *skb = sockb->sk;

	/* Join our sockets back to back */
	sock_hold(ska);
	sock_hold(skb);
	unix_peer(ska) = skb;
	unix_peer(skb) = ska;
	init_peercred(ska);
	init_peercred(skb);

	if (ska->sk_type != SOCK_DGRAM) {
		ska->sk_state = TCP_ESTABLISHED;
		skb->sk_state = TCP_ESTABLISHED;
		socka->state  = SS_CONNECTED;
		sockb->state  = SS_CONNECTED;
	}
	return 0;
}
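
/* Sketch of the corresponding userspace call (illustrative only):
 *
 *	int sv[2];
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		// sv[0] and sv[1] are each other's peer, with
 *		// SS_CONNECTED/TCP_ESTABLISHED set as above
 *	}
 */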
static void unix_sock_inherit_flags(const struct socket *old,
				    struct socket *new)
{
	if (test_bit(SOCK_PASSCRED, &old->flags))
		set_bit(SOCK_PASSCRED, &new->flags);
	if (test_bit(SOCK_PASSSEC, &old->flags))
		set_bit(SOCK_PASSSEC, &new->flags);
}

static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_lock(tsk);
	newsock->state = SS_CONNECTED;
	unix_sock_inherit_flags(sock, newsock);
	sock_graft(tsk, newsock);
	unix_state_unlock(tsk);
	return 0;

out:
	return err;
}


static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
	int err = 0;

	if (peer) {
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_lock(sk);
	if (!u->addr) {
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_unlock(sk);
	sock_put(sk);
out:
	return err;
}
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count-1; i >= 0; i--)
		unix_notinflight(scm->fp->fp[i]);
}

static void unix_destruct_scm(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	scm.pid = UNIXCB(skb).pid;
	if (UNIXCB(skb).fp)
		unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}

/*
 * The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

#define MAX_RECURSION_LEVEL 4

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;
	unsigned char max_level = 0;
	int unix_sock_count = 0;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

	for (i = scm->fp->count - 1; i >= 0; i--) {
		struct sock *sk = unix_get_socket(scm->fp->fp[i]);

		if (sk) {
			unix_sock_count++;
			max_level = max(max_level,
					unix_sk(sk)->recursion_level);
		}
	}
	if (unlikely(max_level > MAX_RECURSION_LEVEL))
		return -ETOOMANYREFS;

	/*
	 * Need to duplicate file references for the sake of garbage
	 * collection. Otherwise a socket in the fps might become a
	 * candidate for GC while the skb is not yet queued.
	 */
	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->fp[i]);
	return max_level;
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
	int err = 0;

	UNIXCB(skb).pid = get_pid(scm->pid);
	UNIXCB(skb).uid = scm->creds.uid;
	UNIXCB(skb).gid = scm->creds.gid;
	UNIXCB(skb).fp = NULL;
	if (scm->fp && send_fds)
		err = unix_attach_fds(scm, skb);

	skb->destructor = unix_destruct_scm;
	return err;
}

/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
			    const struct sock *other)
{
	if (UNIXCB(skb).pid)
		return;
	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
	    !other->sk_socket ||
	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
		UNIXCB(skb).pid = get_pid(task_tgid(current));
		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
	}
}
/*
 *	Send AF_UNIX data.
 */

static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;
	int max_level;
	int data_len = 0;
	int sk_locked;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	if (len > SKB_MAX_ALLOC) {
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);
		data_len = PAGE_ALIGN(data_len);

		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
	}

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err,
				   PAGE_ALLOC_COSTLY_ORDER);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(siocb->scm, skb, true);
	if (err < 0)
		goto out_free;
	max_level = err + 1;
	unix_get_secdata(siocb->scm, skb);

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	sk_locked = 0;
	unix_state_lock(other);
restart_locked:
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (unlikely(sock_flag(other, SOCK_DEAD))) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		sock_put(other);

		if (!sk_locked)
			unix_state_lock(sk);

		err = 0;
		if (unix_peer(sk) == other) {
			unix_peer(sk) = NULL;
			unix_dgram_peer_wake_disconnect_wakeup(sk, other);

			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* other == sk && unix_peer(other) != sk if
	 * - unix_peer(sk) == NULL, destination address bound to sk
	 * - unix_peer(sk) == sk by time of get but disconnected before lock
	 */
	if (other != sk &&
	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
		if (timeo) {
			timeo = unix_wait_for_peer(other, timeo);

			err = sock_intr_errno(timeo);
			if (signal_pending(current))
				goto out_free;

			goto restart;
		}

		if (!sk_locked) {
			unix_state_unlock(other);
			unix_state_double_lock(sk, other);
		}

		if (unix_peer(sk) != other ||
		    unix_dgram_peer_wake_me(sk, other)) {
			err = -EAGAIN;
			sk_locked = 1;
			goto out_unlock;
		}

		if (!sk_locked) {
			sk_locked = 1;
			goto restart_locked;
		}
	}

	if (unlikely(sk_locked))
		unix_state_unlock(sk);

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	if (sk_locked)
		unix_state_unlock(sk);
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie tmp_scm;
	bool fds_sent = false;
	int max_level;
	int data_len;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err,
					   get_order(UNIX_SKB_FRAGS_SZ));
		if (!skb)
			goto out_err;

		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		max_level = err + 1;
		fds_sent = true;

		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
						   sent, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other);
		sent += size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	return sent ? : err;
}
static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
				  struct msghdr *msg, size_t len)
{
	int err;
	struct sock *sk = sock->sk;

	err = sock_error(sk);
	if (err)
		return err;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	if (msg->msg_namelen)
		msg->msg_namelen = 0;

	return unix_dgram_sendmsg(kiocb, sock, msg, len);
}

static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
				  struct msghdr *msg, size_t size,
				  int flags)
{
	struct sock *sk = sock->sk;

	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTCONN;

	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
}

static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	if (u->addr) {
		msg->msg_namelen = u->addr->len;
		memcpy(msg->msg_name, u->addr->name, u->addr->len);
	}
}
static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	err = mutex_lock_interruptible(&u->readlock);
	if (unlikely(err)) {
		/* recvmsg() in non blocking mode is supposed to return -EAGAIN,
		 * but sk_rcvtimeo is not honored by mutex_lock_interruptible()
		 */
		err = noblock ? -EAGAIN : -ERESTARTSYS;
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(siocb->scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - not return fds - good, but too simple 8)
		   - return fds, and not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose this for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}
/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last)
{
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_unlock(sk);
		timeo = freezable_schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}

static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	return skb->len - UNIXCB(skb).consumed;
}
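
/* A stream skb may be consumed across several reads; UNIXCB(skb).consumed
 * records how much of it has already been copied out, so unix_skb_len()
 * is the amount still readable.
 */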
2083 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
2084 struct msghdr *msg, size_t size,
2085 int flags)
2087 struct sock_iocb *siocb = kiocb_to_siocb(iocb);
2088 struct scm_cookie tmp_scm;
2089 struct sock *sk = sock->sk;
2090 struct unix_sock *u = unix_sk(sk);
2091 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
2092 int copied = 0;
2093 int noblock = flags & MSG_DONTWAIT;
2094 int check_creds = 0;
2095 int target;
2096 int err = 0;
2097 long timeo;
2098 int skip;
2100 err = -EINVAL;
2101 if (sk->sk_state != TCP_ESTABLISHED)
2102 goto out;
2104 err = -EOPNOTSUPP;
2105 if (flags&MSG_OOB)
2106 goto out;
2108 target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
2109 timeo = sock_rcvtimeo(sk, noblock);
2111 /* Lock the socket to prevent queue disordering
2112 * while sleeps in memcpy_tomsg
2115 if (!siocb->scm) {
2116 siocb->scm = &tmp_scm;
2117 memset(&tmp_scm, 0, sizeof(tmp_scm));
2120 mutex_lock(&u->readlock);
2122 if (flags & MSG_PEEK)
2123 skip = sk_peek_offset(sk, flags);
2124 else
2125 skip = 0;
2127 do {
2128 int chunk;
2129 struct sk_buff *skb, *last;
2131 unix_state_lock(sk);
2132 if (sock_flag(sk, SOCK_DEAD)) {
2133 err = -ECONNRESET;
2134 goto unlock;
2136 last = skb = skb_peek(&sk->sk_receive_queue);
2137 again:
2138 if (skb == NULL) {
2139 unix_sk(sk)->recursion_level = 0;
2140 if (copied >= target)
2141 goto unlock;
2144 * POSIX 1003.1g mandates this order.
2147 err = sock_error(sk);
2148 if (err)
2149 goto unlock;
2150 if (sk->sk_shutdown & RCV_SHUTDOWN)
2151 goto unlock;
2153 unix_state_unlock(sk);
2154 err = -EAGAIN;
2155 if (!timeo)
2156 break;
2157 mutex_unlock(&u->readlock);
2159 timeo = unix_stream_data_wait(sk, timeo, last);
2161 if (signal_pending(current)) {
2162 err = sock_intr_errno(timeo);
2163 goto out;
2166 mutex_lock(&u->readlock);
2167 continue;
2168 unlock:
2169 unix_state_unlock(sk);
2170 break;
2173 while (skip >= unix_skb_len(skb)) {
2174 skip -= unix_skb_len(skb);
2175 last = skb;
2176 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2177 if (!skb)
2178 goto again;
		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if ((UNIXCB(skb).pid != siocb->scm->pid) ||
			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			check_creds = 1;
		}

		/* Copy address just once */
		if (sunaddr) {
			unix_copy_addr(msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
					    msg->msg_iov, chunk)) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(siocb->scm, skb);

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (siocb->scm->fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);

			sk_peek_offset_fwd(sk, chunk);

			if (UNIXCB(skb).fp)
				break;

			skip = 0;
			last = skb;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->readlock);
	scm_recv(sock, msg, siocb->scm, flags);
out:
	return copied ? : err;
}

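/* shutdown(2) handler. Records the shutdown mode on this socket and, for
 * connection-oriented types, mirrors it onto the peer (a local SHUT_WR
 * becomes the peer's RCV_SHUTDOWN) so that blocked readers and writers
 * on both ends are woken.
 */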
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

		int peer_mode = 0;

		if (mode & RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode & SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}

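/* Bytes available for reading, as reported by the SIOCINQ ioctl: for
 * stream and seqpacket sockets the sum of unread bytes across the whole
 * receive queue, for datagram sockets the size of the next packet only.
 */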
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

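/* Bytes queued for sending but not yet consumed by the receiver, as
 * reported by the SIOCOUTQ ioctl; this is simply the skb memory still
 * charged to the sender (sk_wmem_alloc).
 */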
long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);

static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}

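/* poll(2)/select(2) handler for stream and seqpacket sockets. Socket
 * state is sampled without taking the state lock; sock_poll_wait()
 * registers the waiter so a concurrent state change re-triggers the
 * poll.
 */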
static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= POLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;

	return mask;
}

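/* Datagram variant of poll: in addition to the usual checks, a connected
 * sender is only reported writable while the peer's receive queue has
 * room, and unix_dgram_peer_wake_me() queues us on the peer so we are
 * woken when space frees up.
 */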
static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
				    poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int mask, writable;

	sock_poll_wait(file, sk_sleep(sk), wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= POLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	return mask;
}

#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

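/* The seq_file position is a single loff_t, so the iterator packs both
 * coordinates into it: the hash bucket index in the high bits and the
 * 1-based offset within that bucket in the low BUCKET_SPACE bits. For
 * example, assuming UNIX_HASH_BITS == 8 (defined in the AF_UNIX
 * headers), BUCKET_SPACE is 54 on a 64-bit build and the third socket
 * of bucket 5 sits at pos == (5UL << 54) | 3.
 */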
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	struct sock *sk;
	unsigned long count = 0;

	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
		if (sock_net(sk) != seq_file_net(seq))
			continue;
		if (++count == offset)
			break;
	}

	return sk;
}

static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}

static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(unix_table_lock)
{
	spin_lock(&unix_table_lock);

	if (!*pos)
		return SEQ_START_TOKEN;

	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
		return NULL;

	return unix_next_socket(seq, NULL, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return unix_next_socket(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}

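/* Emit one /proc/net/unix line per socket: kernel address, refcount,
 * protocol (always 0 for AF_UNIX), flags, type, state and inode,
 * followed by the bound path. Abstract addresses are printed with a
 * leading '@' in place of the NUL byte that starts the name.
 */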
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			atomic_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i]);
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}

static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};

static int unix_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &unix_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations unix_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= unix_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};


static int __net_init unix_net_init(struct net *net)
{
	int error = -ENOMEM;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
		unix_sysctl_unregister(net);
		goto out;
	}
#endif
	error = 0;
out:
	return error;
}

static void __net_exit unix_net_exit(struct net *net)
{
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}

static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};

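/* Module init: register the unix_sock slab-backed proto, the PF_UNIX
 * socket family and the per-network-namespace setup (sysctls and the
 * /proc/net/unix file) declared above.
 */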
static int __init af_unix_init(void)
{
	int rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));

	rc = proto_register(&unix_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
out:
	return rc;
}

static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_proto);
	unregister_pernet_subsys(&unix_net_ops);
}

/* Earlier than device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket. But later than subsys_initcall() because
   we depend on stuff initialised there */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);