1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
4 #include <sys/statvfs.h>
9 #include "dirent-util.h"
13 #include "missing_magic.h"
14 #include "nspawn-def.h"
15 #include "nspawn-patch-uid.h"
16 #include "stat-util.h"
17 #include "stdio-util.h"
18 #include "string-util.h"
20 #include "user-util.h"
24 static int get_acl(int fd
, const char *name
, acl_type_t type
, acl_t
*ret
) {
31 _cleanup_close_
int child_fd
= -EBADF
;
33 child_fd
= openat(fd
, name
, O_PATH
|O_CLOEXEC
|O_NOFOLLOW
);
37 acl
= acl_get_file(FORMAT_PROC_FD_PATH(child_fd
), type
);
38 } else if (type
== ACL_TYPE_ACCESS
)
41 acl
= acl_get_file(FORMAT_PROC_FD_PATH(fd
), type
);
49 static int set_acl(int fd
, const char *name
, acl_type_t type
, acl_t acl
) {
56 _cleanup_close_
int child_fd
= -EBADF
;
58 child_fd
= openat(fd
, name
, O_PATH
|O_CLOEXEC
|O_NOFOLLOW
);
62 r
= acl_set_file(FORMAT_PROC_FD_PATH(child_fd
), type
, acl
);
63 } else if (type
== ACL_TYPE_ACCESS
)
64 r
= acl_set_fd(fd
, acl
);
66 r
= acl_set_file(FORMAT_PROC_FD_PATH(fd
), type
, acl
);
73 static int shift_acl(acl_t acl
, uid_t shift
, acl_t
*ret
) {
74 _cleanup_(acl_freep
) acl_t copy
= NULL
;
81 r
= acl_get_entry(acl
, ACL_FIRST_ENTRY
, &i
);
85 uid_t
*old_uid
, new_uid
;
89 if (acl_get_tag_type(i
, &tag
) < 0)
92 if (IN_SET(tag
, ACL_USER
, ACL_GROUP
)) {
94 /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that
95 * this is actually OK */
96 assert_cc(sizeof(uid_t
) == sizeof(gid_t
));
98 old_uid
= acl_get_qualifier(i
);
102 new_uid
= shift
| (*old_uid
& UINT32_C(0xFFFF));
103 if (!uid_is_valid(new_uid
))
106 modify
= new_uid
!= *old_uid
;
107 if (modify
&& !copy
) {
110 /* There's no copy of the ACL yet? If so, let's create one, and start the loop from the
111 * beginning, so that we copy all entries, starting from the first, this time. */
113 n
= acl_entries(acl
);
121 /* Seek back to the beginning */
122 r
= acl_get_entry(acl
, ACL_FIRST_ENTRY
, &i
);
130 acl_entry_t new_entry
;
132 if (acl_create_entry(©
, &new_entry
) < 0)
135 if (acl_copy_entry(new_entry
, i
) < 0)
139 if (acl_set_qualifier(new_entry
, &new_uid
) < 0)
143 r
= acl_get_entry(acl
, ACL_NEXT_ENTRY
, &i
);
148 *ret
= TAKE_PTR(copy
);
153 static int patch_acls(int fd
, const char *name
, const struct stat
*st
, uid_t shift
) {
154 _cleanup_(acl_freep
) acl_t acl
= NULL
, shifted
= NULL
;
155 bool changed
= false;
161 /* ACLs are not supported on symlinks, there's no point in trying */
162 if (S_ISLNK(st
->st_mode
))
165 r
= get_acl(fd
, name
, ACL_TYPE_ACCESS
, &acl
);
166 if (r
== -EOPNOTSUPP
)
171 r
= shift_acl(acl
, shift
, &shifted
);
175 r
= set_acl(fd
, name
, ACL_TYPE_ACCESS
, shifted
);
182 if (S_ISDIR(st
->st_mode
)) {
188 acl
= shifted
= NULL
;
190 r
= get_acl(fd
, name
, ACL_TYPE_DEFAULT
, &acl
);
194 r
= shift_acl(acl
, shift
, &shifted
);
198 r
= set_acl(fd
, name
, ACL_TYPE_DEFAULT
, shifted
);
211 static int patch_acls(int fd
, const char *name
, const struct stat
*st
, uid_t shift
) {
217 static int patch_fd(int fd
, const char *name
, const struct stat
*st
, uid_t shift
) {
220 bool changed
= false;
226 new_uid
= shift
| (st
->st_uid
& UINT32_C(0xFFFF));
227 new_gid
= (gid_t
) shift
| (st
->st_gid
& UINT32_C(0xFFFF));
229 if (!uid_is_valid(new_uid
) || !gid_is_valid(new_gid
))
232 if (st
->st_uid
!= new_uid
|| st
->st_gid
!= new_gid
) {
234 r
= fchownat(fd
, name
, new_uid
, new_gid
, AT_SYMLINK_NOFOLLOW
);
236 r
= fchown(fd
, new_uid
, new_gid
);
240 /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
242 if (!S_ISLNK(st
->st_mode
))
243 r
= fchmodat(fd
, name
, st
->st_mode
, 0);
244 else /* Changing the mode of a symlink is not supported by Linux kernel. Don't bother. */
247 r
= fchmod(fd
, st
->st_mode
);
254 r
= patch_acls(fd
, name
, st
, shift
);
258 return r
> 0 || changed
;
262 * Check if the filesystem is fully compatible with user namespaces or
263 * UID/GID patching. Some filesystems in this list can be fully mounted inside
264 * user namespaces; however, their inodes may relate to host resources or be only
265 * valid in the global user namespace, therefore no patching should be applied.
267 static int is_fs_fully_userns_compatible(const struct statfs
*sfs
) {
271 return F_TYPE_EQUAL(sfs
->f_type
, BINFMTFS_MAGIC
) ||
272 F_TYPE_EQUAL(sfs
->f_type
, CGROUP_SUPER_MAGIC
) ||
273 F_TYPE_EQUAL(sfs
->f_type
, CGROUP2_SUPER_MAGIC
) ||
274 F_TYPE_EQUAL(sfs
->f_type
, DEBUGFS_MAGIC
) ||
275 F_TYPE_EQUAL(sfs
->f_type
, DEVPTS_SUPER_MAGIC
) ||
276 F_TYPE_EQUAL(sfs
->f_type
, EFIVARFS_MAGIC
) ||
277 F_TYPE_EQUAL(sfs
->f_type
, HUGETLBFS_MAGIC
) ||
278 F_TYPE_EQUAL(sfs
->f_type
, MQUEUE_MAGIC
) ||
279 F_TYPE_EQUAL(sfs
->f_type
, PROC_SUPER_MAGIC
) ||
280 F_TYPE_EQUAL(sfs
->f_type
, PSTOREFS_MAGIC
) ||
281 F_TYPE_EQUAL(sfs
->f_type
, SELINUX_MAGIC
) ||
282 F_TYPE_EQUAL(sfs
->f_type
, SMACK_MAGIC
) ||
283 F_TYPE_EQUAL(sfs
->f_type
, SECURITYFS_MAGIC
) ||
284 F_TYPE_EQUAL(sfs
->f_type
, BPF_FS_MAGIC
) ||
285 F_TYPE_EQUAL(sfs
->f_type
, TRACEFS_MAGIC
) ||
286 F_TYPE_EQUAL(sfs
->f_type
, SYSFS_MAGIC
);
289 static int recurse_fd(int fd
, bool donate_fd
, const struct stat
*st
, uid_t shift
, bool is_toplevel
) {
290 _cleanup_closedir_
DIR *d
= NULL
;
291 bool changed
= false;
297 if (fstatfs(fd
, &sfs
) < 0)
300 /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
301 * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
302 * when we hit procfs, sysfs or some other special file systems. */
304 r
= is_fs_fully_userns_compatible(&sfs
);
308 r
= 0; /* don't recurse */
312 /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
313 if ((sfs
.f_flags
& ST_RDONLY
) ||
314 access_fd(fd
, W_OK
) == -EROFS
)
317 if (S_ISDIR(st
->st_mode
)) {
321 copy
= fcntl(fd
, F_DUPFD_CLOEXEC
, 3);
331 d
= take_fdopendir(&fd
);
337 FOREACH_DIRENT_ALL(de
, d
, r
= -errno
; goto finish
) {
340 if (dot_or_dot_dot(de
->d_name
))
343 if (fstatat(dirfd(d
), de
->d_name
, &fst
, AT_SYMLINK_NOFOLLOW
) < 0) {
348 if (S_ISDIR(fst
.st_mode
)) {
351 subdir_fd
= openat(dirfd(d
), de
->d_name
, O_RDONLY
|O_NONBLOCK
|O_DIRECTORY
|O_CLOEXEC
|O_NOFOLLOW
|O_NOATIME
);
358 r
= recurse_fd(subdir_fd
, true, &fst
, shift
, false);
365 r
= patch_fd(dirfd(d
), de
->d_name
, &fst
, shift
);
374 /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
375 * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
376 * tree is properly chown()ed already. */
377 r
= patch_fd(d
? dirfd(d
) : fd
, NULL
, st
, shift
);
388 _cleanup_free_
char *name
= NULL
;
390 /* When we hit a read-only subtree we simply skip it, but log about it. */
391 (void) fd_get_path(fd
, &name
);
392 log_debug("Skipping read-only file or directory %s.", strna(name
));
403 static int fd_patch_uid_internal(int fd
, bool donate_fd
, uid_t shift
, uid_t range
) {
409 /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
410 * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
411 * following the concept that the upper 16 bits of a UID identify the container, and the lower 16 bits are the actual
412 * UID within the container. */
414 if ((shift
& 0xFFFF) != 0) {
415 /* We only support containers where the shift starts at a 2^16 boundary */
420 if (shift
== UID_BUSY_BASE
) {
425 if (range
!= 0x10000) {
426 /* We only support containers with 16-bit UID ranges for the patching logic */
431 if (fstat(fd
, &st
) < 0) {
436 if ((uint32_t) st
.st_uid
>> 16 != (uint32_t) st
.st_gid
>> 16) {
437 /* We only support containers where the uid/gid container ID match */
442 /* Try to detect if the range is already right. Of course, this is a pretty drastic optimization, as we assume
443 * that if the top-level dir has the right upper 16 bits assigned, then everything below will have too... */
444 if (((uint32_t) (st
.st_uid
^ shift
) >> 16) == 0)
447 /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
448 * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
449 * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd ever pick for ourselves. */
451 if ((st
.st_uid
& UID_BUSY_MASK
) != UID_BUSY_BASE
) {
453 UID_BUSY_BASE
| (st
.st_uid
& ~UID_BUSY_MASK
),
454 (gid_t
) UID_BUSY_BASE
| (st
.st_gid
& ~(gid_t
) UID_BUSY_MASK
)) < 0) {
460 return recurse_fd(fd
, donate_fd
, &st
, shift
, true);
469 int path_patch_uid(const char *path
, uid_t shift
, uid_t range
) {
472 fd
= open(path
, O_RDONLY
|O_NONBLOCK
|O_DIRECTORY
|O_CLOEXEC
|O_NOFOLLOW
|O_NOATIME
);
476 return fd_patch_uid_internal(fd
, true, shift
, range
);