1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 #include "alloc-util.h"
6 #include "cgroup-setup.h"
9 #include "format-util.h"
12 #include "mount-util.h"
13 #include "mountpoint-util.h"
14 #include "nspawn-cgroup.h"
15 #include "nspawn-mount.h"
16 #include "path-util.h"
18 #include "string-util.h"
20 #include "user-util.h"
/* Hands files of the cgroup directory 'path' over to 'uid_shift'.
 *
 * The directory is opened once (O_RDONLY|O_CLOEXEC|O_DIRECTORY) and the listed attribute files are then
 * chowned relative to that descriptor with fchownat(). A failure on an individual file is only logged
 * ("ignoring"): at LOG_DEBUG when errno is ENOENT, at LOG_WARNING otherwise. */
22 static int chown_cgroup_path(const char *path
, uid_t uid_shift
) {
23 _cleanup_close_
int fd
= -EBADF
;
25 fd
= open(path
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
31 "cgroup.clone_children",
36 "cgroup.subtree_control",
/* uid_shift is passed for both the owner and the group argument. */
42 if (fchownat(fd
, fn
, uid_shift
, uid_shift
, 0) < 0)
43 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
,
44 "Failed to chown \"%s/%s\", ignoring: %m", path
, fn
);
/* Chowns the cgroup of process 'pid' to 'uid_shift'.
 *
 * The cgroup path of the process is resolved with cg_pid_get_path(), translated into a file system path
 * of the SYSTEMD_CGROUP_CONTROLLER hierarchy with cg_get_path(), and handed to chown_cgroup_path().
 * Under the condition further down, the same is repeated for SYSTEMD_CGROUP_CONTROLLER_LEGACY.
 * Each failure path visible here logs through log_error_errno() and returns its result. */
49 int chown_cgroup(pid_t pid
, CGroupUnified unified_requested
, uid_t uid_shift
) {
50 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
53 r
= cg_pid_get_path(NULL
, pid
, &path
);
55 return log_error_errno(r
, "Failed to get container cgroup path: %m");
57 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
59 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
61 r
= chown_cgroup_path(fs
, uid_shift
);
63 return log_error_errno(r
, "Failed to chown() cgroup %s: %m", fs
);
/* Taken when only the systemd hierarchy is requested as unified, or when nothing is requested as
 * unified but cg_unified_controller() returns a positive value for the systemd controller. */
65 if (unified_requested
== CGROUP_UNIFIED_SYSTEMD
|| (unified_requested
== CGROUP_UNIFIED_NONE
&& cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0)) {
66 _cleanup_free_
char *lfs
= NULL
;
67 /* Always propagate access rights from unified to legacy controller */
69 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY
, path
, NULL
, &lfs
);
71 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
73 r
= chown_cgroup_path(lfs
, uid_shift
);
75 return log_error_errno(r
, "Failed to chown() cgroup %s: %m", lfs
);
/* Replicates the cgroup of 'pid' into the other flavour of the systemd hierarchy when host and
 * container disagree on whether that hierarchy is unified.
 *
 * A hierarchy is mounted on a temporary directory (template "/tmp/unifiedXXXXXX"), a possibly left-over
 * cgroup of the same path is removed, the PID is written to cgroup.procs below that path, the cgroup is
 * chowned via chown_cgroup_path(), and the temporary mount is unmounted again. */
81 int sync_cgroup(pid_t pid
, CGroupUnified unified_requested
, uid_t uid_shift
) {
82 _cleanup_free_
char *cgroup
= NULL
;
83 char tree
[] = "/tmp/unifiedXXXXXX", pid_string
[DECIMAL_STR_MAX(pid
) + 1];
84 bool undo_mount
= false;
86 int r
, unified_controller
;
88 unified_controller
= cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
);
89 if (unified_controller
< 0)
90 return log_error_errno(unified_controller
, "Failed to determine whether the systemd hierarchy is unified: %m");
/* Both sides are compared as booleans: "the host's systemd hierarchy is unified" against
 * "the container is supposed to get at least the systemd hierarchy unified". */
91 if ((unified_controller
> 0) == (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
))
94 /* When the host uses the legacy cgroup setup, but the
95 * container shall use the unified hierarchy, let's make sure
96 * we copy the path from the name=systemd hierarchy into the
97 * unified hierarchy. Similar for the reverse situation. */
99 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
101 return log_error_errno(r
, "Failed to get control group of " PID_FMT
": %m", pid
);
103 /* In order to access the unified hierarchy we need to mount it */
105 return log_error_errno(errno
, "Failed to generate temporary mount point for unified hierarchy: %m");
/* With a unified host the legacy name=systemd hierarchy is the one to mount on the temporary
 * directory; the cgroup2 mount that follows covers the opposite case. */
107 if (unified_controller
> 0)
108 r
= mount_nofollow_verbose(LOG_ERR
, "cgroup", tree
, "cgroup",
109 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "none,name=systemd,xattr");
111 r
= mount_nofollow_verbose(LOG_ERR
, "cgroup", tree
, "cgroup2",
112 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
118 /* If nspawn dies abruptly the cgroup hierarchy created below
119 * its unit isn't cleaned up. So, let's remove it
120 * https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
121 fn
= strjoina(tree
, cgroup
);
122 (void) rm_rf(fn
, REMOVE_ROOT
|REMOVE_ONLY_DIRECTORIES
);
124 fn
= strjoina(tree
, cgroup
, "/cgroup.procs");
/* pid_string was declared above with DECIMAL_STR_MAX(pid) + 1 bytes. */
126 sprintf(pid_string
, PID_FMT
, pid
);
127 r
= write_string_file(fn
, pid_string
, WRITE_STRING_FILE_DISABLE_BUFFER
|WRITE_STRING_FILE_MKDIR_0755
);
129 log_error_errno(r
, "Failed to move process: %m");
133 fn
= strjoina(tree
, cgroup
);
134 r
= chown_cgroup_path(fn
, uid_shift
);
136 log_error_errno(r
, "Failed to chown() cgroup %s: %m", fn
);
/* The result of unmounting the temporary mount is deliberately discarded. */
139 (void) umount_verbose(LOG_ERR
, tree
, UMOUNT_NOFOLLOW
);
/* Splits a cgroup into a "payload" and a "supervisor" child.
 *
 * The container process 'pid' is attached to "<cgroup>/payload" and PID 0 (presumably the calling
 * process, TODO confirm) to "<cgroup>/supervisor". A trailing "/supervisor" on the looked-up cgroup path
 * is chopped off first. Finally cg_enable_everywhere() is invoked on the parent cgroup with the mask
 * obtained from cg_mask_supported(); its result is ignored. */
145 int create_subcgroup(pid_t pid
, bool keep_unit
, CGroupUnified unified_requested
) {
146 _cleanup_free_
char *cgroup
= NULL
, *payload
= NULL
;
147 CGroupMask supported
;
153 /* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we are running in
154 * the unified hierarchy and the container does the same, and we did not create a scope unit for the container
155 * move us and the container into two separate subcgroups.
157 * Moreover, container payloads such as systemd try to manage the cgroup they run in full (i.e. including
158 * its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
159 * delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
160 * host systemd directly to the payload, the host and payload systemd might fight for the cgroup
161 * attributes. Hence, let's insert an intermediary cgroup to cover that case too.
163 * Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
164 * that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the
165 * legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
168 r
= cg_mask_supported(&supported
);
170 return log_error_errno(r
, "Failed to determine supported controllers: %m");
173 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, 0, &cgroup
);
175 r
= cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER
, pid
, &cgroup
);
177 return log_error_errno(r
, "Failed to get our control group: %m");
179 /* If the service manager already placed us in the supervisor cgroup, let's handle that. */
180 e
= endswith(cgroup
, "/supervisor");
182 *e
= 0; /* chop off, we want the main path delegated to us */
184 payload
= path_join(cgroup
, "payload");
/* The container process goes into the payload subcgroup. */
188 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, payload
, pid
);
190 return log_error_errno(r
, "Failed to create %s subcgroup: %m", payload
);
193 _cleanup_free_
char *supervisor
= NULL
;
195 supervisor
= path_join(cgroup
, "supervisor");
/* PID 0 is passed here, as opposed to 'pid' for the payload above. */
199 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, supervisor
, 0);
201 return log_error_errno(r
, "Failed to create %s subcgroup: %m", supervisor
);
204 /* Try to enable as many controllers as possible for the new payload. */
205 (void) cg_enable_everywhere(supported
, supported
, cgroup
, NULL
);
/* get_process_controllers(): reads /proc/self/cgroup line by line, looks for the ':' separator in each
 * line and stores controller strings with set_put_strdup(). Entries equal to "", "name=systemd" or
 * "name=unified" are special-cased by the STR_IN_SET() check (presumably skipped, TODO confirm).
 * On success the collected Set is handed to the caller through *ret. If the file cannot be opened,
 * -ESRCH is returned for ENOENT and -errno for anything else. */
209 /* Retrieve existing subsystems. This function is called in a new cgroup
212 static int get_process_controllers(Set
**ret
) {
213 _cleanup_set_free_ Set
*controllers
= NULL
;
214 _cleanup_fclose_
FILE *f
= NULL
;
219 f
= fopen("/proc/self/cgroup", "re");
221 return errno
== ENOENT
? -ESRCH
: -errno
;
224 _cleanup_free_
char *line
= NULL
;
227 r
= read_line(f
, LONG_LINE_MAX
, &line
);
233 l
= strchr(line
, ':');
244 if (STR_IN_SET(l
, "", "name=systemd", "name=unified"))
247 r
= set_put_strdup(&controllers
, l
);
252 *ret
= TAKE_PTR(controllers
);
// Mounts a single legacy cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
//
// The target is probed with path_is_mount_point() first; an error other than -ENOENT is logged and
// returned. The directory is created with mkdir_p() (result ignored), the file system type and mount
// options are chosen from 'controller', and the hierarchy is mounted with MS_NOSUID|MS_NOEXEC|MS_NODEV.
// A second call bind-remounts the mount point with MS_RDONLY added, leaving the superblock options alone.
257 static int mount_legacy_cgroup_hierarchy(
259 const char *controller
,
260 const char *hierarchy
,
263 const char *to
, *fstype
, *opts
;
266 to
= strjoina(strempty(dest
), "/sys/fs/cgroup/", hierarchy
);
268 r
= path_is_mount_point(to
, dest
, 0);
269 if (r
< 0 && r
!= -ENOENT
)
270 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
274 (void) mkdir_p(to
, 0755);
276 /* The superblock mount options of the mount point need to be
277 * identical to the hosts', and hence writable... */
278 if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
)) {
281 } else if (streq(controller
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
)) {
283 opts
= "none,name=systemd,xattr";
289 r
= mount_nofollow_verbose(LOG_ERR
, "cgroup", to
, fstype
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, opts
);
293 /* ... hence let's only make the bind mount read-only, not the superblock. */
295 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, to
, NULL
,
296 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
304 /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
/* Operates on the fixed path "/sys/fs/cgroup" (cgroup_root) and passes "" as destination prefix to
 * mount_legacy_cgroup_hierarchy(). Steps visible here: make sure a tmpfs is mounted on cgroup_root,
 * mount one hierarchy per controller returned by get_process_controllers(), create symlinks for the
 * individual names of co-mounted controllers, mount the "unified" hierarchy when unified_requested is
 * at least CGROUP_UNIFIED_SYSTEMD, mount the "systemd" hierarchy, and finally remount cgroup_root with
 * MS_RDONLY. The cg_all_unified() result can jump straight to the skip_controllers label. */
305 static int mount_legacy_cgns_supported(
307 CGroupUnified unified_requested
,
311 const char *selinux_apifs_context
) {
313 _cleanup_set_free_ Set
*controllers
= NULL
;
314 const char *cgroup_root
= "/sys/fs/cgroup", *c
;
317 (void) mkdir_p(cgroup_root
, 0755);
319 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
320 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
322 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
324 _cleanup_free_
char *options
= NULL
;
326 /* When cgroup namespaces are enabled and user namespaces are
327 * used then the mount of the cgroupfs is done *inside* the new
328 * user namespace. We're root in the new user namespace and the
329 * kernel will happily translate our uid/gid to the correct
330 * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
331 * pass uid 0 and not uid_shift to tmpfs_patch_options().
333 r
= tmpfs_patch_options("mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP
, 0, selinux_apifs_context
, &options
);
337 r
= mount_nofollow_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
338 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
343 r
= cg_all_unified();
347 goto skip_controllers
;
349 r
= get_process_controllers(&controllers
);
351 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
354 _cleanup_free_
const char *controller
= NULL
;
356 controller
= set_steal_first(controllers
);
360 r
= mount_legacy_cgroup_hierarchy("", controller
, controller
, !userns
);
364 /* When multiple hierarchies are co-mounted, make their
365 * constituting individual hierarchies a symlink to the
370 _cleanup_free_
char *target
= NULL
, *tok
= NULL
;
372 r
= extract_first_word(&c
, &tok
, ",", 0);
374 return log_error_errno(r
, "Failed to extract co-mounted cgroup controller: %m");
378 if (streq(controller
, tok
))
381 target
= path_join("/sys/fs/cgroup/", tok
);
385 r
= symlink_idempotent(controller
, target
, false);
387 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
389 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
394 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
395 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
400 r
= mount_legacy_cgroup_hierarchy("", SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
405 return mount_nofollow_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
406 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
,
412 /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
/* Follows the same overall sequence as mount_legacy_cgns_supported(), with these visible differences:
 * cgroup_root is "/sys/fs/cgroup" prefixed with 'dest', the controller list comes from
 * cg_kernel_controllers(), and for every controller the un-prefixed /sys/fs/cgroup/<controller> is
 * inspected with readlink_malloc() to tell a directly mounted hierarchy from a symlink to a combined
 * one. Combined names that fail filename_is_valid() are ignored with a warning. */
413 static int mount_legacy_cgns_unsupported(
415 CGroupUnified unified_requested
,
419 const char *selinux_apifs_context
) {
421 _cleanup_set_free_ Set
*controllers
= NULL
;
422 const char *cgroup_root
;
425 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
427 (void) mkdir_p(cgroup_root
, 0755);
429 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
430 r
= path_is_mount_point(cgroup_root
, dest
, AT_SYMLINK_FOLLOW
);
432 return log_error_errno(r
, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
434 _cleanup_free_
char *options
= NULL
;
/* A uid_shift of 0 is passed on as UID_INVALID, any other value is passed through unchanged. */
436 r
= tmpfs_patch_options("mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP
,
437 uid_shift
== 0 ? UID_INVALID
: uid_shift
,
438 selinux_apifs_context
,
443 r
= mount_nofollow_verbose(LOG_ERR
, "tmpfs", cgroup_root
, "tmpfs",
444 MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, options
);
449 r
= cg_all_unified();
453 goto skip_controllers
;
455 r
= cg_kernel_controllers(&controllers
);
457 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
460 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
462 controller
= set_steal_first(controllers
);
/* 'origin' is built without the 'dest' prefix. */
466 origin
= path_join("/sys/fs/cgroup/", controller
);
470 r
= readlink_malloc(origin
, &combined
);
472 /* Not a symbolic link, but directly a single cgroup hierarchy */
474 r
= mount_legacy_cgroup_hierarchy(dest
, controller
, controller
, true);
479 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
481 _cleanup_free_
char *target
= NULL
;
/* 'target' is the same path as 'origin', but below 'dest'. */
483 target
= path_join(dest
, origin
);
487 /* A symbolic link, a combination of controllers in one hierarchy */
489 if (!filename_is_valid(combined
)) {
490 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
494 r
= mount_legacy_cgroup_hierarchy(dest
, combined
, combined
, true);
498 r
= symlink_idempotent(combined
, target
, false);
500 return log_error_errno(r
, "Invalid existing symlink for combined hierarchy: %m");
502 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
507 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
508 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_HYBRID
, "unified", false);
513 r
= mount_legacy_cgroup_hierarchy(dest
, SYSTEMD_CGROUP_CONTROLLER_LEGACY
, "systemd", false);
517 return mount_nofollow_verbose(LOG_ERR
, NULL
, cgroup_root
, NULL
,
518 MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
,
/* Mounts the cgroup2 file system at <dest>/sys/fs/cgroup with MS_NOSUID|MS_NOEXEC|MS_NODEV.
 *
 * The mount point is created with mkdir_p() (result ignored) and probed with path_is_mount_point().
 * The existence of cgroup.procs at the top of the mount point is tested with access(); an already
 * mounted hierarchy that is not unified is refused with a synthetic EINVAL. */
522 static int mount_unified_cgroups(const char *dest
) {
528 p
= prefix_roota(dest
, "/sys/fs/cgroup");
530 (void) mkdir_p(p
, 0755);
532 r
= path_is_mount_point(p
, dest
, AT_SYMLINK_FOLLOW
);
534 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", p
);
/* From here on 'p' refers to the cgroup.procs file, also in the log messages below. */
536 p
= prefix_roota(dest
, "/sys/fs/cgroup/cgroup.procs");
537 if (access(p
, F_OK
) >= 0)
540 return log_error_errno(errno
, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p
);
542 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
543 "%s is already mounted but not a unified cgroup hierarchy. Refusing.", p
);
546 return mount_nofollow_verbose(LOG_ERR
, "cgroup", p
, "cgroup2", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
551 CGroupUnified unified_requested
,
555 const char *selinux_apifs_context
,
/* Dispatch: a request of at least CGROUP_UNIFIED_ALL is served by mount_unified_cgroups(); otherwise
 * one of the two legacy implementations is called with the same set of arguments. */
558 if (unified_requested
>= CGROUP_UNIFIED_ALL
)
559 return mount_unified_cgroups(dest
);
561 return mount_legacy_cgns_supported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
563 return mount_legacy_cgns_unsupported(dest
, unified_requested
, userns
, uid_shift
, uid_range
, selinux_apifs_context
);
/* Keeps the cgroup directory 'own' writable while turning the hierarchy at 'root' read-only.
 *
 * 'own' is first bind-mounted onto itself; afterwards 'root' is remounted as a bind mount with
 * MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY. Returns the result of the second mount call. */
566 static int mount_systemd_cgroup_writable_one(const char *root
, const char *own
) {
572 /* Make our own cgroup a (writable) bind mount */
573 r
= mount_nofollow_verbose(LOG_ERR
, own
, own
, NULL
, MS_BIND
, NULL
);
577 /* And then remount the systemd cgroup root read-only */
578 return mount_nofollow_verbose(LOG_ERR
, NULL
, root
, NULL
,
579 MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
);
/* Makes our own cgroup writable inside the cgroup tree below 'dest'.
 *
 * The own cgroup path is looked up with cg_pid_get_path(NULL, 0, ...); the top-level cgroup "/" is
 * special-cased. Depending on unified_requested the hierarchy root is /sys/fs/cgroup (fully unified),
 * /sys/fs/cgroup/unified, or /sys/fs/cgroup/systemd, each prefixed with 'dest'. The actual mounting is
 * delegated to mount_systemd_cgroup_writable_one() with the root and the own cgroup directory below it. */
582 int mount_systemd_cgroup_writable(
584 CGroupUnified unified_requested
) {
586 _cleanup_free_
char *own_cgroup_path
= NULL
;
587 const char *root
, *own
;
592 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
594 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
596 /* If we are living in the top-level, then there's nothing to do... */
597 if (path_equal(own_cgroup_path
, "/"))
600 if (unified_requested
>= CGROUP_UNIFIED_ALL
) {
602 root
= prefix_roota(dest
, "/sys/fs/cgroup");
603 own
= strjoina(root
, own_cgroup_path
);
607 if (unified_requested
>= CGROUP_UNIFIED_SYSTEMD
) {
608 root
= prefix_roota(dest
, "/sys/fs/cgroup/unified");
609 own
= strjoina(root
, own_cgroup_path
);
611 r
= mount_systemd_cgroup_writable_one(root
, own
);
/* The name=systemd hierarchy is handled with the same helper. */
616 root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
617 own
= strjoina(root
, own_cgroup_path
);
620 return mount_systemd_cgroup_writable_one(root
, own
);