// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET3:	Garbage Collector For AF_UNIX sockets
 *
 * Copyright (C) Barak A. Pearlmutter.
 *
 * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem.
 * If it doesn't work blame me, it worked when Barak sent it.
 *
 * Current optimizations:
 *
 *  - explicit stack instead of recursion
 *  - tail recurse on first born instead of immediate push/pop
 *  - we gather the stuff that should not be killed into tree
 *    and stack is just a path from root to the current pointer.
 *
 * Future optimizations:
 *
 *  - don't just push entire root set; process in place
 *
 *	Fixes:
 *	Alan Cox	07 Sept	1997	Vmalloc internal stack as needed.
 *					Cope with changing max_files.
 *
 *		Graph may have cycles. That is, we can send the descriptor
 *		of foo to bar and vice versa. Current code chokes on that.
 *		Fix: move SCM_RIGHTS ones into the separate list and then
 *		skb_free() them all instead of doing explicit fput's.
 *		Another problem: since fput() may block somebody may
 *		create a new unix_socket when we are in the middle of sweep
 *		phase. Fix: revert the logic wrt MARKED. Mark everything
 *		upon the beginning and unmark non-junk ones.
 *
 *		[12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS
 *		sent to connect()'ed but still not accept()'ed sockets.
 *		Fixed. Old code had slightly different problem here:
 *		extra fput() in situation when we passed the descriptor via
 *		such socket and closed it (descriptor). That would happen on
 *		each unix_gc() until the accept(). Since the struct file in
 *		question would go to the free list and might be reused...
 *		That might be the reason of random oopses on filp_close()
 *		in unrelated processes.
 *
 *		Kill the explicit allocation of stack. Now we keep the tree
 *		with root in dummy + pointer (gc_current) to one of the nodes.
 *		Stack is represented as path from gc_current to dummy. Unmark
 *		now means "add to tree". Push == "make it a son of gc_current".
 *		Pop == "move gc_current to parent". We keep only pointers to
 *		parents (->gc_tree).
 *
 *		Damn. Added missing check for ->dead in listen queues scanning.
 *
 *	Miklos Szeredi 25 Jun 2007
 *		Reimplement with a cycle collecting algorithm. This should
 *		solve several problems with the previous code, like being racy
 *		wrt receive and holding up unrelated socket operations.
 */

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/mutex.h>
#include <linux/wait.h>

#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <net/tcp_states.h>

struct unix_sock *unix_get_socket(struct file *filp)
{
	struct inode *inode = file_inode(filp);

	/* Socket ? */
	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
		struct socket *sock = SOCKET_I(inode);
		const struct proto_ops *ops;
		struct sock *sk = sock->sk;

		ops = READ_ONCE(sock->ops);

		/* PF_UNIX ? */
		if (sk && ops && ops->family == PF_UNIX)
			return unix_sk(sk);
	}

	return NULL;
}

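/* Inflight AF_UNIX sockets and the SCM_RIGHTS references between them form a
 * directed graph: a socket whose fd is in flight gets a unix_vertex, and each
 * in-flight fd adds a unix_edge from that socket (predecessor) to the socket
 * receiving it (successor).
 */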
static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
{
	/* If an embryo socket has a fd,
	 * the listener indirectly holds the fd's refcnt.
	 */
	if (edge->successor->listener)
		return unix_sk(edge->successor->listener)->vertex;

	return edge->successor->vertex;
}

static bool unix_graph_maybe_cyclic;
static bool unix_graph_grouped;

static void unix_update_graph(struct unix_vertex *vertex)
{
	/* If the receiver socket is not inflight, no cyclic
	 * reference could be formed.
	 */
	if (!vertex)
		return;

	unix_graph_maybe_cyclic = true;
	unix_graph_grouped = false;
}

static LIST_HEAD(unix_unvisited_vertices);

enum unix_vertex_index {
	UNIX_VERTEX_INDEX_MARK1,
	UNIX_VERTEX_INDEX_MARK2,
	UNIX_VERTEX_INDEX_START,
};

static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;

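/* unix_add_edge() hooks an edge onto the sender's vertex. If the sender has
 * no vertex yet, one preallocated by unix_prepare_fpl() is taken from
 * fpl->vertices and moved onto unix_unvisited_vertices for the next GC run.
 */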
static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
{
	struct unix_vertex *vertex = edge->predecessor->vertex;

	if (!vertex) {
		vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
		vertex->index = unix_vertex_unvisited_index;
		vertex->out_degree = 0;
		INIT_LIST_HEAD(&vertex->edges);
		INIT_LIST_HEAD(&vertex->scc_entry);

		list_move_tail(&vertex->entry, &unix_unvisited_vertices);
		edge->predecessor->vertex = vertex;
	}

	vertex->out_degree++;
	list_add_tail(&edge->vertex_entry, &vertex->edges);

	unix_update_graph(unix_edge_successor(edge));
}

static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
{
	struct unix_vertex *vertex = edge->predecessor->vertex;

	if (!fpl->dead)
		unix_update_graph(unix_edge_successor(edge));

	list_del(&edge->vertex_entry);
	vertex->out_degree--;

	if (!vertex->out_degree) {
		edge->predecessor->vertex = NULL;
		list_move_tail(&vertex->entry, &fpl->vertices);
	}
}

static void unix_free_vertices(struct scm_fp_list *fpl)
{
	struct unix_vertex *vertex, *next_vertex;

	list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
		list_del(&vertex->entry);
		kfree(vertex);
	}
}

static DEFINE_SPINLOCK(unix_gc_lock);
unsigned int unix_tot_inflight;

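/* unix_add_edges() registers every AF_UNIX fd in an SCM_RIGHTS fpl as an edge
 * towards the receiving socket, under unix_gc_lock. Preallocated vertices
 * that ended up unused are released via unix_free_vertices() afterwards.
 */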
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
{
	int i = 0, j = 0;

	spin_lock(&unix_gc_lock);

	if (!fpl->count_unix)
		goto out;

	do {
		struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
		struct unix_edge *edge;

		if (!inflight)
			continue;

		edge = fpl->edges + i++;
		edge->predecessor = inflight;
		edge->successor = receiver;

		unix_add_edge(fpl, edge);
	} while (i < fpl->count_unix);

	receiver->scm_stat.nr_unix_fds += fpl->count_unix;
	WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
out:
	WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);

	spin_unlock(&unix_gc_lock);

	fpl->inflight = true;

	unix_free_vertices(fpl);
}

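/* unix_del_edges() is the counterpart of unix_add_edges(): once the fds are
 * no longer in flight, it unlinks the fpl's edges and drops the inflight
 * counters.
 */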
void unix_del_edges(struct scm_fp_list *fpl)
{
	struct unix_sock *receiver;
	int i = 0;

	spin_lock(&unix_gc_lock);

	if (!fpl->count_unix)
		goto out;

	do {
		struct unix_edge *edge = fpl->edges + i++;

		unix_del_edge(fpl, edge);
	} while (i < fpl->count_unix);

	if (!fpl->dead) {
		receiver = fpl->edges[0].successor;
		receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
	}
	WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
out:
	WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);

	spin_unlock(&unix_gc_lock);

	fpl->inflight = false;
}

void unix_update_edges(struct unix_sock *receiver)
{
	/* nr_unix_fds is only updated under unix_state_lock().
	 * If it's 0 here, the embryo socket is not part of the
	 * inflight graph, and GC will not see it, so no lock needed.
	 */
	if (!receiver->scm_stat.nr_unix_fds) {
		receiver->listener = NULL;
	} else {
		spin_lock(&unix_gc_lock);
		unix_update_graph(unix_sk(receiver->listener)->vertex);
		receiver->listener = NULL;
		spin_unlock(&unix_gc_lock);
	}
}

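/* unix_prepare_fpl() preallocates one vertex per AF_UNIX fd and the edge
 * array up front, so that unix_add_edges() never needs to allocate while
 * holding unix_gc_lock.
 */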
int unix_prepare_fpl(struct scm_fp_list *fpl)
{
	struct unix_vertex *vertex;
	int i;

	if (!fpl->count_unix)
		return 0;

	for (i = 0; i < fpl->count_unix; i++) {
		vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
		if (!vertex)
			goto err;

		list_add(&vertex->entry, &fpl->vertices);
	}

	fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
				    GFP_KERNEL_ACCOUNT);
	if (!fpl->edges)
		goto err;

	return 0;

err:
	unix_free_vertices(fpl);
	return -ENOMEM;
}

void unix_destroy_fpl(struct scm_fp_list *fpl)
{
	if (fpl->inflight)
		unix_del_edges(fpl);

	kvfree(fpl->edges);
	unix_free_vertices(fpl);
}

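/* A vertex is dead when every edge stays inside its own SCC and the file's
 * refcount equals the vertex's out_degree, i.e. the only remaining references
 * to the socket are the in-flight SCM_RIGHTS ones within that SCC, so user
 * space can no longer reach it.
 */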
static bool unix_vertex_dead(struct unix_vertex *vertex)
{
	struct unix_edge *edge;
	struct unix_sock *u;
	long total_ref;

	list_for_each_entry(edge, &vertex->edges, vertex_entry) {
		struct unix_vertex *next_vertex = unix_edge_successor(edge);

		/* The vertex's fd can be received by a non-inflight socket. */
		if (!next_vertex)
			return false;

		/* The vertex's fd can be received by an inflight socket in
		 * another SCC.
		 */
		if (next_vertex->scc_index != vertex->scc_index)
			return false;
	}

	/* No receiver exists out of the same SCC. */

	edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
	u = edge->predecessor;
	total_ref = file_count(u->sk.sk_socket->file);

	/* If not close()d, total_ref > out_degree. */
	if (total_ref != vertex->out_degree)
		return false;

	return true;
}

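/* Once an SCC is known to be dead, unix_collect_skb() splices the queued
 * skbs carrying the cyclic references into the hitlist. For a listener,
 * the receive queues of its embryo sockets are drained as well.
 */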
static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
{
	struct unix_vertex *vertex;

	list_for_each_entry_reverse(vertex, scc, scc_entry) {
		struct sk_buff_head *queue;
		struct unix_edge *edge;
		struct unix_sock *u;

		edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
		u = edge->predecessor;
		queue = &u->sk.sk_receive_queue;

		spin_lock(&queue->lock);

		if (u->sk.sk_state == TCP_LISTEN) {
			struct sk_buff *skb;

			skb_queue_walk(queue, skb) {
				struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;

				spin_lock(&embryo_queue->lock);
				skb_queue_splice_init(embryo_queue, hitlist);
				spin_unlock(&embryo_queue->lock);
			}
		} else {
			skb_queue_splice_init(queue, hitlist);
		}

		spin_unlock(&queue->lock);
	}
}

static bool unix_scc_cyclic(struct list_head *scc)
{
	struct unix_vertex *vertex;
	struct unix_edge *edge;

	/* SCC containing multiple vertices ? */
	if (!list_is_singular(scc))
		return true;

	vertex = list_first_entry(scc, typeof(*vertex), scc_entry);

	/* Self-reference or an embryo-listener circle ? */
	list_for_each_entry(edge, &vertex->edges, vertex_entry) {
		if (unix_edge_successor(edge) == vertex)
			return true;
	}

	return false;
}

static LIST_HEAD(unix_visited_vertices);
static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;

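/* __unix_walk_scc() runs an iterative, Tarjan-style strongly connected
 * component search: vertex_stack holds the current SCC candidates and
 * edge_stack replaces recursion, so the walk never consumes kernel stack
 * proportional to the depth of the graph.
 */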
static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
			    struct sk_buff_head *hitlist)
{
	LIST_HEAD(vertex_stack);
	struct unix_edge *edge;
	LIST_HEAD(edge_stack);

next_vertex:
	/* Push vertex to vertex_stack and mark it as on-stack
	 * (index >= UNIX_VERTEX_INDEX_START).
	 * The vertex will be popped when finalising SCC later.
	 */
	list_add(&vertex->scc_entry, &vertex_stack);

	vertex->index = *last_index;
	vertex->scc_index = *last_index;
	(*last_index)++;

	/* Explore neighbour vertices (receivers of the current vertex's fd). */
	list_for_each_entry(edge, &vertex->edges, vertex_entry) {
		struct unix_vertex *next_vertex = unix_edge_successor(edge);

		if (!next_vertex)
			continue;

		if (next_vertex->index == unix_vertex_unvisited_index) {
			/* Iterative deepening depth first search
			 *
			 *   1. Push a forward edge to edge_stack and set
			 *      the successor to vertex for the next iteration.
			 */
			list_add(&edge->stack_entry, &edge_stack);

			vertex = next_vertex;
			goto next_vertex;

			/*   2. Pop the edge directed to the current vertex
			 *      and restore the ancestor for backtracking.
			 */
prev_vertex:
			edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
			list_del_init(&edge->stack_entry);

			next_vertex = vertex;
			vertex = edge->predecessor->vertex;

			/* If the successor has a smaller scc_index, two vertices
			 * are in the same SCC, so propagate the smaller scc_index
			 * to skip SCC finalisation.
			 */
			vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
		} else if (next_vertex->index != unix_vertex_grouped_index) {
			/* Loop detected by a back/cross edge.
			 *
			 * The successor is on vertex_stack, so two vertices are in
			 * the same SCC. If the successor has a smaller *scc_index*,
			 * propagate it to skip SCC finalisation.
			 */
			vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
		} else {
			/* The successor was already grouped as another SCC */
		}
	}

	if (vertex->index == vertex->scc_index) {
		struct unix_vertex *v;
		struct list_head scc;
		bool scc_dead = true;

		/* SCC finalised.
		 *
		 * If the scc_index was not updated, all the vertices above on
		 * vertex_stack are in the same SCC. Group them using scc_entry.
		 */
		__list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);

		list_for_each_entry_reverse(v, &scc, scc_entry) {
			/* Don't restart DFS from this vertex in unix_walk_scc(). */
			list_move_tail(&v->entry, &unix_visited_vertices);

			/* Mark vertex as off-stack. */
			v->index = unix_vertex_grouped_index;

			if (scc_dead)
				scc_dead = unix_vertex_dead(v);
		}

		if (scc_dead)
			unix_collect_skb(&scc, hitlist);
		else if (!unix_graph_maybe_cyclic)
			unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);

		list_del(&scc);
	}

	/* Need backtracking ? */
	if (!list_empty(&edge_stack))
		goto prev_vertex;
}

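/* Full pass: group every unvisited vertex into an SCC and collect dead ones.
 * Afterwards the visited/unvisited lists and the index marks are swapped so
 * a later pass can reuse the grouping via unix_walk_scc_fast().
 */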
static void unix_walk_scc(struct sk_buff_head *hitlist)
{
	unsigned long last_index = UNIX_VERTEX_INDEX_START;

	unix_graph_maybe_cyclic = false;

	/* Visit every vertex exactly once.
	 * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
	 */
	while (!list_empty(&unix_unvisited_vertices)) {
		struct unix_vertex *vertex;

		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
		__unix_walk_scc(vertex, &last_index, hitlist);
	}

	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
	swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);

	unix_graph_grouped = true;
}

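/* Fast pass: if no edge has been added since the last full walk
 * (unix_graph_grouped is still true), the old SCC grouping remains valid,
 * so only the liveness check and skb collection are repeated per SCC.
 */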
static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
{
	unix_graph_maybe_cyclic = false;

	while (!list_empty(&unix_unvisited_vertices)) {
		struct unix_vertex *vertex;
		struct list_head scc;
		bool scc_dead = true;

		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
		list_add(&scc, &vertex->scc_entry);

		list_for_each_entry_reverse(vertex, &scc, scc_entry) {
			list_move_tail(&vertex->entry, &unix_visited_vertices);

			if (scc_dead)
				scc_dead = unix_vertex_dead(vertex);
		}

		if (scc_dead)
			unix_collect_skb(&scc, hitlist);
		else if (!unix_graph_maybe_cyclic)
			unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);

		list_del(&scc);
	}

	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
}

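/* The garbage collector runs from a workqueue (__unix_gc); gc_in_progress is
 * set by unix_gc() before queueing the work and cleared once the work has
 * finished, so wait_for_unix_gc() can tell whether a run is pending.
 */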
static bool gc_in_progress;

static void __unix_gc(struct work_struct *work)
{
	struct sk_buff_head hitlist;
	struct sk_buff *skb;

	spin_lock(&unix_gc_lock);

	if (!unix_graph_maybe_cyclic) {
		spin_unlock(&unix_gc_lock);
		goto skip_gc;
	}

	__skb_queue_head_init(&hitlist);

	if (unix_graph_grouped)
		unix_walk_scc_fast(&hitlist);
	else
		unix_walk_scc(&hitlist);

	spin_unlock(&unix_gc_lock);

	skb_queue_walk(&hitlist, skb) {
		if (UNIXCB(skb).fp)
			UNIXCB(skb).fp->dead = true;
	}

	__skb_queue_purge(&hitlist);
skip_gc:
	WRITE_ONCE(gc_in_progress, false);
}

static DECLARE_WORK(unix_gc_work, __unix_gc);

void unix_gc(void)
{
	WRITE_ONCE(gc_in_progress, true);
	queue_work(system_unbound_wq, &unix_gc_work);
}

#define UNIX_INFLIGHT_TRIGGER_GC 16000
#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)

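/* wait_for_unix_gc() is the send-path throttle: it kicks the GC when too many
 * sockets are in flight and makes a user with an excessive number of
 * not-yet-received AF_UNIX fds wait for the current GC run to finish.
 */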
void wait_for_unix_gc(struct scm_fp_list *fpl)
{
	/* If number of inflight sockets is insane,
	 * force a garbage collect right now.
	 *
	 * Paired with the WRITE_ONCE() in unix_add_edges(),
	 * unix_del_edges(), and __unix_gc().
	 */
	if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
	    !READ_ONCE(gc_in_progress))
		unix_gc();

	/* Penalise users who want to send AF_UNIX sockets
	 * but whose sockets have not been received yet.
	 */
	if (!fpl || !fpl->count_unix ||
	    READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
		return;

	if (READ_ONCE(gc_in_progress))
		flush_work(&unix_gc_work);
}