// SPDX-License-Identifier: GPL-2.0-only
/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
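 *
 * Illustrative usage (assumes cgroup v2 mounted at the conventional
 * /sys/fs/cgroup; "mygroup" is an example cgroup name):
 *
 *   mkdir /sys/fs/cgroup/mygroup
 *   echo 100 > /sys/fs/cgroup/mygroup/pids.max  # cap the subtree at 100 tasks
 *   cat /sys/fs/cgroup/mygroup/pids.current     # tasks currently charged
 *   echo max > /sys/fs/cgroup/mygroup/pids.max  # restore the default "no limit"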
 *
 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
 */

#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/sched/task.h>

#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"

enum pidcg_event {
        /* Fork failed in subtree because this pids_cgroup limit was hit. */
        PIDCG_MAX,
        /* Fork failed in this pids_cgroup because ancestor limit was hit. */
        PIDCG_FORKFAIL,
        NR_PIDCG_EVENTS,
};

struct pids_cgroup {
        struct cgroup_subsys_state css;

        /*
         * Use 64-bit types so that we can safely represent "max" as
         * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
         */
        atomic64_t counter;
        atomic64_t limit;
        int64_t watermark;

        /* Handles for pids.events[.local] */
        struct cgroup_file events_file;
        struct cgroup_file events_local_file;

        atomic64_t events[NR_PIDCG_EVENTS];
        atomic64_t events_local[NR_PIDCG_EVENTS];
};

static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
        return container_of(css, struct pids_cgroup, css);
}

static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
        return css_pids(pids->css.parent);
}

static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
        struct pids_cgroup *pids;

        pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
        if (!pids)
                return ERR_PTR(-ENOMEM);

        atomic64_set(&pids->limit, PIDS_MAX);
        return &pids->css;
}

static void pids_css_free(struct cgroup_subsys_state *css)
{
        kfree(css_pids(css));
}

static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
{
        /*
         * This is racy, but we don't need perfectly accurate tallying of
         * the watermark, and this lets us avoid extra atomic overhead.
         */
        if (nr_pids > READ_ONCE(p->watermark))
                WRITE_ONCE(p->watermark, nr_pids);
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
        /*
         * A negative count (or overflow for that matter) is invalid,
         * and indicates a bug in the `pids` controller proper.
         */
        WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - hierarchically uncharge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; parent_pids(p); p = parent_pids(p))
                pids_cancel(p, num);
}

/**
 * pids_charge - hierarchically charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 *
 * This function does *not* follow the pid limit set. It cannot fail and the new
 * pid count may exceed the limit. This is only used for reverting failed
 * attaches, where there is no other way out than violating the limit.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
        struct pids_cgroup *p;

        for (p = pids; parent_pids(p); p = parent_pids(p)) {
                int64_t new = atomic64_add_return(num, &p->counter);

                pids_update_watermark(p, new);
        }
}

/**
 * pids_try_charge - hierarchically try to charge the pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to charge
 * @fail: storage of pid cgroup causing the fail
 *
 * This function follows the set limit. It will fail if the charge would cause
 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 * succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
{
        struct pids_cgroup *p, *q;

        for (p = pids; parent_pids(p); p = parent_pids(p)) {
                int64_t new = atomic64_add_return(num, &p->counter);
                int64_t limit = atomic64_read(&p->limit);

                /*
                 * Since new is capped to the maximum number of pid_t, if
                 * p->limit is %PIDS_MAX then we know that this test will never
                 * fail.
                 */
                if (new > limit) {
                        *fail = p;
                        goto revert;
                }
                /*
                 * Not technically accurate if we go over limit somewhere up
                 * the hierarchy, but that's tolerable for the watermark.
                 */
                pids_update_watermark(p, new);
        }

        return 0;

revert:
        for (q = pids; q != p; q = parent_pids(q))
                pids_cancel(q, num);
        pids_cancel(p, num);

        return -EAGAIN;
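}

/*
 * Worked example of the semantics above (example hierarchy and values, not
 * taken from this file): with cgroups a/b, a's limit set to 2 and a's
 * counter already at 2, pids_try_charge() for b with num == 1 charges b,
 * fails the limit test at a, reverts the charges taken on b and a via
 * pids_cancel(), stores a's pids_cgroup in *fail and returns -EAGAIN.
 */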
static int pids_can_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *pids = css_pids(dst_css);
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                /*
                 * No need to pin @old_css between here and cancel_attach()
                 * because cgroup core protects it from being freed before
                 * the migration completes or fails.
                 */
                old_css = task_css(task, pids_cgrp_id);
                old_pids = css_pids(old_css);

                pids_charge(pids, 1);
                pids_uncharge(old_pids, 1);
        }

        return 0;
}

static void pids_cancel_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *dst_css;

        cgroup_taskset_for_each(task, dst_css, tset) {
                struct pids_cgroup *pids = css_pids(dst_css);
                struct cgroup_subsys_state *old_css;
                struct pids_cgroup *old_pids;

                old_css = task_css(task, pids_cgrp_id);
                old_pids = css_pids(old_css);

                pids_charge(old_pids, 1);
                pids_uncharge(pids, 1);
        }
}

static void pids_event(struct pids_cgroup *pids_forking,
                       struct pids_cgroup *pids_over_limit)
{
        struct pids_cgroup *p = pids_forking;

        /* Only log the first time limit is hit. */
        if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
                pr_info("cgroup: fork rejected by pids controller in ");
                pr_cont_cgroup_path(p->css.cgroup);
                pr_cont("\n");
        }
        if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
            cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
                cgroup_file_notify(&p->events_local_file);
                return;
        }

        atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]);
        cgroup_file_notify(&pids_over_limit->events_local_file);

        for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) {
                atomic64_inc(&p->events[PIDCG_MAX]);
                cgroup_file_notify(&p->events_file);
        }
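}

/*
 * Event-flow illustration (example hierarchy, not taken from this file):
 * when a fork() in a/b is rejected because a's limit was hit on the
 * default hierarchy, b's events_local[PIDCG_FORKFAIL] is bumped, a's
 * events_local[PIDCG_MAX] is bumped, and events[PIDCG_MAX] is bumped for
 * a and each of its ancestors, so pids.events reads hierarchically while
 * pids.events.local stays local to each cgroup.
 */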
/*
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by the copy_process().
 */
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
        struct pids_cgroup *pids, *pids_over_limit;
        int err;

        pids = css_pids(cset->subsys[pids_cgrp_id]);
        err = pids_try_charge(pids, 1, &pids_over_limit);
        if (err)
                pids_event(pids, pids_over_limit);

        return err;
}

static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
        struct pids_cgroup *pids;

        pids = css_pids(cset->subsys[pids_cgrp_id]);
        pids_uncharge(pids, 1);
}

static void pids_release(struct task_struct *task)
{
        struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));

        pids_uncharge(pids, 1);
}

static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, loff_t off)
{
        struct cgroup_subsys_state *css = of_css(of);
        struct pids_cgroup *pids = css_pids(css);
        int64_t limit;
        int err;

        buf = strstrip(buf);
        if (!strcmp(buf, PIDS_MAX_STR)) {
                limit = PIDS_MAX;
                goto set_limit;
        }

        err = kstrtoll(buf, 0, &limit);
        if (err)
                return err;

        if (limit < 0 || limit >= PIDS_MAX)
                return -EINVAL;

set_limit:
        /*
         * Limit updates don't need to be mutex'd, since it isn't
         * critical that any racing fork()s follow the new limit.
         */
        atomic64_set(&pids->limit, limit);
        return nbytes;
}

static int pids_max_show(struct seq_file *sf, void *v)
{
        struct cgroup_subsys_state *css = seq_css(sf);
        struct pids_cgroup *pids = css_pids(css);
        int64_t limit = atomic64_read(&pids->limit);

        if (limit >= PIDS_MAX)
                seq_printf(sf, "%s\n", PIDS_MAX_STR);
        else
                seq_printf(sf, "%lld\n", limit);

        return 0;
}

static s64 pids_current_read(struct cgroup_subsys_state *css,
                             struct cftype *cft)
{
        struct pids_cgroup *pids = css_pids(css);

        return atomic64_read(&pids->counter);
}

static s64 pids_peak_read(struct cgroup_subsys_state *css,
                          struct cftype *cft)
{
        struct pids_cgroup *pids = css_pids(css);

        return READ_ONCE(pids->watermark);
}

static int __pids_events_show(struct seq_file *sf, bool local)
{
        struct pids_cgroup *pids = css_pids(seq_css(sf));
        enum pidcg_event pe = PIDCG_MAX;
        atomic64_t *events;

        if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
            cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
                pe = PIDCG_FORKFAIL;
                local = true;
        }
        events = local ? pids->events_local : pids->events;

        seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe]));
        return 0;
}

static int pids_events_show(struct seq_file *sf, void *v)
{
        __pids_events_show(sf, false);
        return 0;
}

static int pids_events_local_show(struct seq_file *sf, void *v)
{
        __pids_events_show(sf, true);
        return 0;
}

static struct cftype pids_files[] = {
        {
                .name = "max",
                .write = pids_max_write,
                .seq_show = pids_max_show,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "current",
                .read_s64 = pids_current_read,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "peak",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_s64 = pids_peak_read,
        },
        {
                .name = "events",
                .seq_show = pids_events_show,
                .file_offset = offsetof(struct pids_cgroup, events_file),
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "events.local",
                .seq_show = pids_events_local_show,
                .file_offset = offsetof(struct pids_cgroup, events_local_file),
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        { }     /* terminate */
};

static struct cftype pids_files_legacy[] = {
        {
                .name = "max",
                .write = pids_max_write,
                .seq_show = pids_max_show,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "current",
                .read_s64 = pids_current_read,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "peak",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_s64 = pids_peak_read,
        },
        {
                .name = "events",
                .seq_show = pids_events_show,
                .file_offset = offsetof(struct pids_cgroup, events_file),
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        { }     /* terminate */
};

struct cgroup_subsys pids_cgrp_subsys = {
        .css_alloc = pids_css_alloc,
        .css_free = pids_css_free,
        .can_attach = pids_can_attach,
        .cancel_attach = pids_cancel_attach,
        .can_fork = pids_can_fork,
        .cancel_fork = pids_cancel_fork,
        .release = pids_release,
        .legacy_cftypes = pids_files_legacy,
        .dfl_cftypes = pids_files,
        .threaded = true,
};