1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* Common capabilities, needed by capability.o.
5 #include <linux/capability.h>
6 #include <linux/audit.h>
7 #include <linux/init.h>
8 #include <linux/kernel.h>
9 #include <linux/lsm_hooks.h>
10 #include <linux/file.h>
12 #include <linux/mman.h>
13 #include <linux/pagemap.h>
14 #include <linux/swap.h>
15 #include <linux/skbuff.h>
16 #include <linux/netlink.h>
17 #include <linux/ptrace.h>
18 #include <linux/xattr.h>
19 #include <linux/hugetlb.h>
20 #include <linux/mount.h>
21 #include <linux/sched.h>
22 #include <linux/prctl.h>
23 #include <linux/securebits.h>
24 #include <linux/user_namespace.h>
25 #include <linux/binfmts.h>
26 #include <linux/personality.h>
27 #include <linux/mnt_idmapping.h>
28 #include <uapi/linux/lsm.h>
31 * If a non-root user executes a setuid-root binary in
32 * !secure(SECURE_NOROOT) mode, then we raise capabilities.
33 * However if fE is also set, then the intent is for only
34 * the file capabilities to be applied, and the setuid-root
35 * bit is left on either to change the uid (plausible) or
36 * to get full privilege on a kernel without file capabilities
37 * support. So in that case we do not raise capabilities.
39 * Warn if that happens, once per boot.
41 static void warn_setuid_and_fcaps_mixed(const char *fname
)
45 printk(KERN_INFO
"warning: `%s' has both setuid-root and"
46 " effective capabilities. Therefore not raising all"
47 " capabilities.\n", fname
);
53 * cap_capable - Determine whether a task has a particular effective capability
54 * @cred: The credentials to use
55 * @targ_ns: The user namespace in which we need the capability
56 * @cap: The capability to check for
57 * @opts: Bitmask of options defined in include/linux/security.h
59 * Determine whether the nominated task has the specified capability amongst
60 * its effective set, returning 0 if it does, -ve if it does not.
62 * NOTE WELL: cap_capable() cannot be used like the kernel's capable()
63 * and has_capability() functions. That is, it has the reverse semantics:
64 * cap_capable() returns 0 when a task has a capability, but the
65 * kernel's capable() and has_capability() return 1 for this case.
67 int cap_capable(const struct cred
*cred
, struct user_namespace
*targ_ns
,
68 int cap
, unsigned int opts
)
70 struct user_namespace
*ns
= targ_ns
;
72 /* See if cred has the capability in the target user namespace
73 * by examining the target user namespace and all of the target
74 * user namespace's parents.
77 /* Do we have the necessary capabilities? */
78 if (ns
== cred
->user_ns
)
79 return cap_raised(cred
->cap_effective
, cap
) ? 0 : -EPERM
;
82 * If we're already at a lower level than we're looking for,
83 * we're done searching.
85 if (ns
->level
<= cred
->user_ns
->level
)
89 * The owner of the user namespace in the parent of the
90 * user namespace has all caps.
92 if ((ns
->parent
== cred
->user_ns
) && uid_eq(ns
->owner
, cred
->euid
))
96 * If you have a capability in a parent user ns, then you have
97 * it over all children user namespaces as well.
102 /* We never get here */
106 * cap_settime - Determine whether the current process may set the system clock
107 * @ts: The time to set
108 * @tz: The timezone to set
110 * Determine whether the current process may set the system clock and timezone
111 * information, returning 0 if permission granted, -ve if denied.
113 int cap_settime(const struct timespec64
*ts
, const struct timezone
*tz
)
115 if (!capable(CAP_SYS_TIME
))
121 * cap_ptrace_access_check - Determine whether the current process may access
123 * @child: The process to be accessed
124 * @mode: The mode of attachment.
126 * If we are in the same or an ancestor user_ns and have all the target
127 * task's capabilities, then ptrace access is allowed.
128 * If we have the ptrace capability to the target user_ns, then ptrace
132 * Determine whether a process may access another, returning 0 if permission
133 * granted, -ve if denied.
135 int cap_ptrace_access_check(struct task_struct
*child
, unsigned int mode
)
138 const struct cred
*cred
, *child_cred
;
139 const kernel_cap_t
*caller_caps
;
142 cred
= current_cred();
143 child_cred
= __task_cred(child
);
144 if (mode
& PTRACE_MODE_FSCREDS
)
145 caller_caps
= &cred
->cap_effective
;
147 caller_caps
= &cred
->cap_permitted
;
148 if (cred
->user_ns
== child_cred
->user_ns
&&
149 cap_issubset(child_cred
->cap_permitted
, *caller_caps
))
151 if (ns_capable(child_cred
->user_ns
, CAP_SYS_PTRACE
))
160 * cap_ptrace_traceme - Determine whether another process may trace the current
161 * @parent: The task proposed to be the tracer
163 * If parent is in the same or an ancestor user_ns and has all current's
164 * capabilities, then ptrace access is allowed.
165 * If parent has the ptrace capability to current's user_ns, then ptrace
169 * Determine whether the nominated task is permitted to trace the current
170 * process, returning 0 if permission is granted, -ve if denied.
172 int cap_ptrace_traceme(struct task_struct
*parent
)
175 const struct cred
*cred
, *child_cred
;
178 cred
= __task_cred(parent
);
179 child_cred
= current_cred();
180 if (cred
->user_ns
== child_cred
->user_ns
&&
181 cap_issubset(child_cred
->cap_permitted
, cred
->cap_permitted
))
183 if (has_ns_capability(parent
, child_cred
->user_ns
, CAP_SYS_PTRACE
))
192 * cap_capget - Retrieve a task's capability sets
193 * @target: The task from which to retrieve the capability sets
194 * @effective: The place to record the effective set
195 * @inheritable: The place to record the inheritable set
196 * @permitted: The place to record the permitted set
198 * This function retrieves the capabilities of the nominated task and returns
199 * them to the caller.
201 int cap_capget(const struct task_struct
*target
, kernel_cap_t
*effective
,
202 kernel_cap_t
*inheritable
, kernel_cap_t
*permitted
)
204 const struct cred
*cred
;
206 /* Derived from kernel/capability.c:sys_capget. */
208 cred
= __task_cred(target
);
209 *effective
= cred
->cap_effective
;
210 *inheritable
= cred
->cap_inheritable
;
211 *permitted
= cred
->cap_permitted
;
217 * Determine whether the inheritable capabilities are limited to the old
218 * permitted set. Returns 1 if they are limited, 0 if they are not.
220 static inline int cap_inh_is_capped(void)
222 /* they are so limited unless the current task has the CAP_SETPCAP
225 if (cap_capable(current_cred(), current_cred()->user_ns
,
226 CAP_SETPCAP
, CAP_OPT_NONE
) == 0)
232 * cap_capset - Validate and apply proposed changes to current's capabilities
233 * @new: The proposed new credentials; alterations should be made here
234 * @old: The current task's current credentials
235 * @effective: A pointer to the proposed new effective capabilities set
236 * @inheritable: A pointer to the proposed new inheritable capabilities set
237 * @permitted: A pointer to the proposed new permitted capabilities set
239 * This function validates and applies a proposed mass change to the current
240 * process's capability sets. The changes are made to the proposed new
241 * credentials, and assuming no error, will be committed by the caller of LSM.
243 int cap_capset(struct cred
*new,
244 const struct cred
*old
,
245 const kernel_cap_t
*effective
,
246 const kernel_cap_t
*inheritable
,
247 const kernel_cap_t
*permitted
)
249 if (cap_inh_is_capped() &&
250 !cap_issubset(*inheritable
,
251 cap_combine(old
->cap_inheritable
,
252 old
->cap_permitted
)))
253 /* incapable of using this inheritable set */
256 if (!cap_issubset(*inheritable
,
257 cap_combine(old
->cap_inheritable
,
259 /* no new pI capabilities outside bounding set */
262 /* verify restrictions on target's new Permitted set */
263 if (!cap_issubset(*permitted
, old
->cap_permitted
))
266 /* verify the _new_Effective_ is a subset of the _new_Permitted_ */
267 if (!cap_issubset(*effective
, *permitted
))
270 new->cap_effective
= *effective
;
271 new->cap_inheritable
= *inheritable
;
272 new->cap_permitted
= *permitted
;
275 * Mask off ambient bits that are no longer both permitted and
278 new->cap_ambient
= cap_intersect(new->cap_ambient
,
279 cap_intersect(*permitted
,
281 if (WARN_ON(!cap_ambient_invariant_ok(new)))
287 * cap_inode_need_killpriv - Determine if inode change affects privileges
288 * @dentry: The inode/dentry being changed, with the change marked ATTR_KILL_PRIV
290 * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
291 * affects the security markings on that inode, and if it is, should
292 * inode_killpriv() be invoked or the change rejected.
294 * Return: 1 if security.capability has a value, meaning inode_killpriv()
295 * is required, 0 otherwise, meaning inode_killpriv() is not required.
297 int cap_inode_need_killpriv(struct dentry
*dentry
)
299 struct inode
*inode
= d_backing_inode(dentry
);
302 error
= __vfs_getxattr(dentry
, inode
, XATTR_NAME_CAPS
, NULL
, 0);
307 * cap_inode_killpriv - Erase the security markings on an inode
309 * @idmap: idmap of the mount the inode was found from
310 * @dentry: The inode/dentry to alter
312 * Erase the privilege-enhancing security markings on an inode.
314 * If the inode has been found through an idmapped mount the idmap of
315 * the vfsmount must be passed through @idmap. This function will then
316 * take care to map the inode according to @idmap before checking
317 * permissions. On non-idmapped mounts or if permission checking is to be
318 * performed on the raw inode simply pass @nop_mnt_idmap.
320 * Return: 0 if successful, -ve on error.
322 int cap_inode_killpriv(struct mnt_idmap
*idmap
, struct dentry
*dentry
)
326 error
= __vfs_removexattr(idmap
, dentry
, XATTR_NAME_CAPS
);
327 if (error
== -EOPNOTSUPP
)
332 static bool rootid_owns_currentns(vfsuid_t rootvfsuid
)
334 struct user_namespace
*ns
;
337 if (!vfsuid_valid(rootvfsuid
))
340 kroot
= vfsuid_into_kuid(rootvfsuid
);
341 for (ns
= current_user_ns();; ns
= ns
->parent
) {
342 if (from_kuid(ns
, kroot
) == 0)
344 if (ns
== &init_user_ns
)
351 static __u32
sansflags(__u32 m
)
353 return m
& ~VFS_CAP_FLAGS_EFFECTIVE
;
356 static bool is_v2header(int size
, const struct vfs_cap_data
*cap
)
358 if (size
!= XATTR_CAPS_SZ_2
)
360 return sansflags(le32_to_cpu(cap
->magic_etc
)) == VFS_CAP_REVISION_2
;
363 static bool is_v3header(int size
, const struct vfs_cap_data
*cap
)
365 if (size
!= XATTR_CAPS_SZ_3
)
367 return sansflags(le32_to_cpu(cap
->magic_etc
)) == VFS_CAP_REVISION_3
;
371 * getsecurity: We are called for security.* before any attempt to read the
372 * xattr from the inode itself.
374 * This gives us a chance to read the on-disk value and convert it. If we
375 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
377 * Note we are not called by vfs_getxattr_alloc(), but that is only called
378 * by the integrity subsystem, which really wants the unconverted values -
381 int cap_inode_getsecurity(struct mnt_idmap
*idmap
,
382 struct inode
*inode
, const char *name
, void **buffer
,
389 uid_t root
, mappedroot
;
391 struct vfs_cap_data
*cap
;
392 struct vfs_ns_cap_data
*nscap
= NULL
;
393 struct dentry
*dentry
;
394 struct user_namespace
*fs_ns
;
396 if (strcmp(name
, "capability") != 0)
399 dentry
= d_find_any_alias(inode
);
402 size
= vfs_getxattr_alloc(idmap
, dentry
, XATTR_NAME_CAPS
, &tmpbuf
,
403 sizeof(struct vfs_ns_cap_data
), GFP_NOFS
);
405 /* gcc11 complains if we don't check for !tmpbuf */
406 if (size
< 0 || !tmpbuf
)
409 fs_ns
= inode
->i_sb
->s_user_ns
;
410 cap
= (struct vfs_cap_data
*) tmpbuf
;
411 if (is_v2header(size
, cap
)) {
413 } else if (is_v3header(size
, cap
)) {
414 nscap
= (struct vfs_ns_cap_data
*) tmpbuf
;
415 root
= le32_to_cpu(nscap
->rootid
);
421 kroot
= make_kuid(fs_ns
, root
);
423 /* If this is an idmapped mount shift the kuid. */
424 vfsroot
= make_vfsuid(idmap
, fs_ns
, kroot
);
426 /* If the root kuid maps to a valid uid in current ns, then return
427 * this as a nscap. */
428 mappedroot
= from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot
));
429 if (mappedroot
!= (uid_t
)-1 && mappedroot
!= (uid_t
)0) {
430 size
= sizeof(struct vfs_ns_cap_data
);
433 /* v2 -> v3 conversion */
434 nscap
= kzalloc(size
, GFP_ATOMIC
);
439 nsmagic
= VFS_CAP_REVISION_3
;
440 magic
= le32_to_cpu(cap
->magic_etc
);
441 if (magic
& VFS_CAP_FLAGS_EFFECTIVE
)
442 nsmagic
|= VFS_CAP_FLAGS_EFFECTIVE
;
443 memcpy(&nscap
->data
, &cap
->data
, sizeof(__le32
) * 2 * VFS_CAP_U32
);
444 nscap
->magic_etc
= cpu_to_le32(nsmagic
);
446 /* use allocated v3 buffer */
449 nscap
->rootid
= cpu_to_le32(mappedroot
);
455 if (!rootid_owns_currentns(vfsroot
)) {
460 /* This comes from a parent namespace. Return as a v2 capability */
461 size
= sizeof(struct vfs_cap_data
);
464 /* v3 -> v2 conversion */
465 cap
= kzalloc(size
, GFP_ATOMIC
);
470 magic
= VFS_CAP_REVISION_2
;
471 nsmagic
= le32_to_cpu(nscap
->magic_etc
);
472 if (nsmagic
& VFS_CAP_FLAGS_EFFECTIVE
)
473 magic
|= VFS_CAP_FLAGS_EFFECTIVE
;
474 memcpy(&cap
->data
, &nscap
->data
, sizeof(__le32
) * 2 * VFS_CAP_U32
);
475 cap
->magic_etc
= cpu_to_le32(magic
);
477 /* use unconverted v2 */
488 * rootid_from_xattr - translate root uid of vfs caps
490 * @value: vfs caps value to read the root uid from (not modified)
491 * @size: size of @value
492 * @task_ns: user namespace of the caller
494 static vfsuid_t
rootid_from_xattr(const void *value
, size_t size
,
495 struct user_namespace
*task_ns
)
497 const struct vfs_ns_cap_data
*nscap
= value
;
500 if (size
== XATTR_CAPS_SZ_3
)
501 rootid
= le32_to_cpu(nscap
->rootid
);
503 return VFSUIDT_INIT(make_kuid(task_ns
, rootid
));
506 static bool validheader(size_t size
, const struct vfs_cap_data
*cap
)
508 return is_v2header(size
, cap
) || is_v3header(size
, cap
);
512 * cap_convert_nscap - check vfs caps
514 * @idmap: idmap of the mount the inode was found from
515 * @dentry: used to retrieve inode to check permissions on
516 * @ivalue: vfs caps value which may be modified by this function
517 * @size: size of @ivalue
519 * User requested a write of security.capability. If needed, update the
520 * xattr to change from v2 to v3, or to fixup the v3 rootid.
522 * If the inode has been found through an idmapped mount the idmap of
523 * the vfsmount must be passed through @idmap. This function will then
524 * take care to map the inode according to @idmap before checking
525 * permissions. On non-idmapped mounts or if permission checking is to be
526 * performed on the raw inode simply pass @nop_mnt_idmap.
528 * Return: On success, return the new size; on error, return < 0.
530 int cap_convert_nscap(struct mnt_idmap
*idmap
, struct dentry
*dentry
,
531 const void **ivalue
, size_t size
)
533 struct vfs_ns_cap_data
*nscap
;
535 const struct vfs_cap_data
*cap
= *ivalue
;
536 __u32 magic
, nsmagic
;
537 struct inode
*inode
= d_backing_inode(dentry
);
538 struct user_namespace
*task_ns
= current_user_ns(),
539 *fs_ns
= inode
->i_sb
->s_user_ns
;
546 if (!validheader(size
, cap
))
548 if (!capable_wrt_inode_uidgid(idmap
, inode
, CAP_SETFCAP
))
550 if (size
== XATTR_CAPS_SZ_2
&& (idmap
== &nop_mnt_idmap
))
551 if (ns_capable(inode
->i_sb
->s_user_ns
, CAP_SETFCAP
))
552 /* user is privileged, just write the v2 */
555 vfsrootid
= rootid_from_xattr(*ivalue
, size
, task_ns
);
556 if (!vfsuid_valid(vfsrootid
))
559 rootid
= from_vfsuid(idmap
, fs_ns
, vfsrootid
);
560 if (!uid_valid(rootid
))
563 nsrootid
= from_kuid(fs_ns
, rootid
);
567 newsize
= sizeof(struct vfs_ns_cap_data
);
568 nscap
= kmalloc(newsize
, GFP_ATOMIC
);
571 nscap
->rootid
= cpu_to_le32(nsrootid
);
572 nsmagic
= VFS_CAP_REVISION_3
;
573 magic
= le32_to_cpu(cap
->magic_etc
);
574 if (magic
& VFS_CAP_FLAGS_EFFECTIVE
)
575 nsmagic
|= VFS_CAP_FLAGS_EFFECTIVE
;
576 nscap
->magic_etc
= cpu_to_le32(nsmagic
);
577 memcpy(&nscap
->data
, &cap
->data
, sizeof(__le32
) * 2 * VFS_CAP_U32
);
584 * Calculate the new process capability sets from the capability sets attached
587 static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data
*caps
,
588 struct linux_binprm
*bprm
,
592 struct cred
*new = bprm
->cred
;
595 if (caps
->magic_etc
& VFS_CAP_FLAGS_EFFECTIVE
)
598 if (caps
->magic_etc
& VFS_CAP_REVISION_MASK
)
602 * pP' = (X & fP) | (pI & fI)
603 * The addition of pA' is handled later.
605 new->cap_permitted
.val
=
606 (new->cap_bset
.val
& caps
->permitted
.val
) |
607 (new->cap_inheritable
.val
& caps
->inheritable
.val
);
609 if (caps
->permitted
.val
& ~new->cap_permitted
.val
)
610 /* insufficient to execute correctly */
614 * For legacy apps, with no internal support for recognizing they
615 * do not have enough capabilities, we return an error if they are
616 * missing some "forced" (aka file-permitted) capabilities.
618 return *effective
? ret
: 0;
622 * get_vfs_caps_from_disk - retrieve vfs caps from disk
624 * @idmap: idmap of the mount the inode was found from
625 * @dentry: dentry from which @inode is retrieved
626 * @cpu_caps: vfs capabilities
628 * Extract the on-exec-apply capability sets for an executable file.
630 * If the inode has been found through an idmapped mount the idmap of
631 * the vfsmount must be passed through @idmap. This function will then
632 * take care to map the inode according to @idmap before checking
633 * permissions. On non-idmapped mounts or if permission checking is to be
634 * performed on the raw inode simply pass @nop_mnt_idmap.
636 int get_vfs_caps_from_disk(struct mnt_idmap
*idmap
,
637 const struct dentry
*dentry
,
638 struct cpu_vfs_cap_data
*cpu_caps
)
640 struct inode
*inode
= d_backing_inode(dentry
);
643 struct vfs_ns_cap_data data
, *nscaps
= &data
;
644 struct vfs_cap_data
*caps
= (struct vfs_cap_data
*) &data
;
647 struct user_namespace
*fs_ns
;
649 memset(cpu_caps
, 0, sizeof(struct cpu_vfs_cap_data
));
654 fs_ns
= inode
->i_sb
->s_user_ns
;
655 size
= __vfs_getxattr((struct dentry
*)dentry
, inode
,
656 XATTR_NAME_CAPS
, &data
, XATTR_CAPS_SZ
);
657 if (size
== -ENODATA
|| size
== -EOPNOTSUPP
)
658 /* no data, that's ok */
664 if (size
< sizeof(magic_etc
))
667 cpu_caps
->magic_etc
= magic_etc
= le32_to_cpu(caps
->magic_etc
);
669 rootkuid
= make_kuid(fs_ns
, 0);
670 switch (magic_etc
& VFS_CAP_REVISION_MASK
) {
671 case VFS_CAP_REVISION_1
:
672 if (size
!= XATTR_CAPS_SZ_1
)
675 case VFS_CAP_REVISION_2
:
676 if (size
!= XATTR_CAPS_SZ_2
)
679 case VFS_CAP_REVISION_3
:
680 if (size
!= XATTR_CAPS_SZ_3
)
682 rootkuid
= make_kuid(fs_ns
, le32_to_cpu(nscaps
->rootid
));
689 rootvfsuid
= make_vfsuid(idmap
, fs_ns
, rootkuid
);
690 if (!vfsuid_valid(rootvfsuid
))
693 /* Limit the caps to the mounter of the filesystem
694 * or the more limited uid specified in the xattr.
696 if (!rootid_owns_currentns(rootvfsuid
))
699 cpu_caps
->permitted
.val
= le32_to_cpu(caps
->data
[0].permitted
);
700 cpu_caps
->inheritable
.val
= le32_to_cpu(caps
->data
[0].inheritable
);
703 * Rev1 had just a single 32-bit word, later expanded
704 * to a second one for the high bits
706 if ((magic_etc
& VFS_CAP_REVISION_MASK
) != VFS_CAP_REVISION_1
) {
707 cpu_caps
->permitted
.val
+= (u64
)le32_to_cpu(caps
->data
[1].permitted
) << 32;
708 cpu_caps
->inheritable
.val
+= (u64
)le32_to_cpu(caps
->data
[1].inheritable
) << 32;
711 cpu_caps
->permitted
.val
&= CAP_VALID_MASK
;
712 cpu_caps
->inheritable
.val
&= CAP_VALID_MASK
;
714 cpu_caps
->rootid
= vfsuid_into_kuid(rootvfsuid
);
720 * Attempt to get the on-exec apply capability sets for an executable file from
721 * its xattrs and, if present, apply them to the proposed credentials being
722 * constructed by execve().
724 static int get_file_caps(struct linux_binprm
*bprm
, const struct file
*file
,
725 bool *effective
, bool *has_fcap
)
728 struct cpu_vfs_cap_data vcaps
;
730 cap_clear(bprm
->cred
->cap_permitted
);
732 if (!file_caps_enabled
)
735 if (!mnt_may_suid(file
->f_path
.mnt
))
739 * This check is redundant with mnt_may_suid() but is kept to make
740 * explicit that capability bits are limited to s_user_ns and its
743 if (!current_in_userns(file
->f_path
.mnt
->mnt_sb
->s_user_ns
))
746 rc
= get_vfs_caps_from_disk(file_mnt_idmap(file
),
747 file
->f_path
.dentry
, &vcaps
);
750 printk(KERN_NOTICE
"Invalid argument reading file caps for %s\n",
752 else if (rc
== -ENODATA
)
757 rc
= bprm_caps_from_vfs_caps(&vcaps
, bprm
, effective
, has_fcap
);
761 cap_clear(bprm
->cred
->cap_permitted
);
766 static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT
); }
768 static inline bool __is_real(kuid_t uid
, struct cred
*cred
)
769 { return uid_eq(cred
->uid
, uid
); }
771 static inline bool __is_eff(kuid_t uid
, struct cred
*cred
)
772 { return uid_eq(cred
->euid
, uid
); }
774 static inline bool __is_suid(kuid_t uid
, struct cred
*cred
)
775 { return !__is_real(uid
, cred
) && __is_eff(uid
, cred
); }
778 * handle_privileged_root - Handle case of privileged root
779 * @bprm: The execution parameters, including the proposed creds
780 * @has_fcap: Are any file capabilities set?
781 * @effective: Do we have effective root privilege?
782 * @root_uid: This namespace's root UID with respect to the initial user namespace
784 * Handle the case where root is privileged and hasn't been neutered by
785 * SECURE_NOROOT. If file capabilities are set, they won't be combined with
786 * set UID root and nothing is changed. If we are root, cap_permitted is
787 * updated. If we have become set UID root, the effective bit is set.
789 static void handle_privileged_root(struct linux_binprm
*bprm
, bool has_fcap
,
790 bool *effective
, kuid_t root_uid
)
792 const struct cred
*old
= current_cred();
793 struct cred
*new = bprm
->cred
;
795 if (!root_privileged())
798 * If the legacy file capability is set, then don't set privs
799 * for a setuid root binary run by a non-root user. Do set it
800 * for a root user just to cause least surprise to an admin.
802 if (has_fcap
&& __is_suid(root_uid
, new)) {
803 warn_setuid_and_fcaps_mixed(bprm
->filename
);
807 * To support inheritance of root-permissions and suid-root
808 * executables under compatibility mode, we override the
809 * capability sets for the file.
811 if (__is_eff(root_uid
, new) || __is_real(root_uid
, new)) {
812 /* pP' = (cap_bset & ~0) | (pI & ~0) */
813 new->cap_permitted
= cap_combine(old
->cap_bset
,
814 old
->cap_inheritable
);
817 * If only the real uid is 0, we do not set the effective bit.
819 if (__is_eff(root_uid
, new))
/*
 * Capability-set comparison helpers. Each pastes a set name onto "cap_" to
 * select a member of a credential pointer and hands the sets to
 * cap_issubset().
 *
 * __cap_gained(field, target, source): target->cap_<field> is not a subset
 *	of source->cap_<field>.
 * __cap_grew(target, source, cred): cred->cap_<target> is not a subset of
 *	cred->cap_<source>.
 * __cap_full(field, cred): CAP_FULL_SET is a subset of cred->cap_<field>.
 *
 * Pointer arguments and whole expansions are parenthesized so that
 * expression arguments and surrounding operators cannot change the meaning.
 */
#define __cap_gained(field, target, source) \
	(!cap_issubset((target)->cap_##field, (source)->cap_##field))
#define __cap_grew(target, source, cred) \
	(!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
#define __cap_full(field, cred) \
	(cap_issubset(CAP_FULL_SET, (cred)->cap_##field))
830 static inline bool __is_setuid(struct cred
*new, const struct cred
*old
)
831 { return !uid_eq(new->euid
, old
->uid
); }
833 static inline bool __is_setgid(struct cred
*new, const struct cred
*old
)
834 { return !gid_eq(new->egid
, old
->gid
); }
837 * 1) Audit candidate if current->cap_effective is set
839 * We do not bother to audit if 3 things are true:
840 * 1) cap_effective has all caps
841 * 2) we became root *OR* were already root
842 * 3) root is supposed to have all caps (SECURE_NOROOT is not set)
843 * Since this is just a normal root execing a process.
845 * Number 1 above might fail if you don't have a full bset, but I think
846 * that is interesting information to audit.
848 * A number of other conditions require logging:
849 * 2) something prevented setuid root getting all caps
850 * 3) non-setuid root gets fcaps
851 * 4) non-setuid root gets ambient
853 static inline bool nonroot_raised_pE(struct cred
*new, const struct cred
*old
,
854 kuid_t root
, bool has_fcap
)
858 if ((__cap_grew(effective
, ambient
, new) &&
859 !(__cap_full(effective
, new) &&
860 (__is_eff(root
, new) || __is_real(root
, new)) &&
861 root_privileged())) ||
862 (root_privileged() &&
863 __is_suid(root
, new) &&
864 !__cap_full(effective
, new)) ||
865 (!__is_setuid(new, old
) &&
867 __cap_gained(permitted
, new, old
)) ||
868 __cap_gained(ambient
, new, old
))))
876 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
877 * @bprm: The execution parameters, including the proposed creds
878 * @file: The file to pull the credentials from
880 * Set up the proposed credentials for a new execution context being
881 * constructed by execve(). The proposed creds in @bprm->cred is altered,
882 * which won't take effect immediately.
884 * Return: 0 if successful, -ve on error.
886 int cap_bprm_creds_from_file(struct linux_binprm
*bprm
, const struct file
*file
)
888 /* Process setpcap binaries and capabilities for uid 0 */
889 const struct cred
*old
= current_cred();
890 struct cred
*new = bprm
->cred
;
891 bool effective
= false, has_fcap
= false, is_setid
;
895 if (WARN_ON(!cap_ambient_invariant_ok(old
)))
898 ret
= get_file_caps(bprm
, file
, &effective
, &has_fcap
);
902 root_uid
= make_kuid(new->user_ns
, 0);
904 handle_privileged_root(bprm
, has_fcap
, &effective
, root_uid
);
906 /* if we have fs caps, clear dangerous personality flags */
907 if (__cap_gained(permitted
, new, old
))
908 bprm
->per_clear
|= PER_CLEAR_ON_SETID
;
910 /* Don't let someone trace a set[ug]id/setpcap binary with the revised
911 * credentials unless they have the appropriate permit.
913 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
915 is_setid
= __is_setuid(new, old
) || __is_setgid(new, old
);
917 if ((is_setid
|| __cap_gained(permitted
, new, old
)) &&
918 ((bprm
->unsafe
& ~LSM_UNSAFE_PTRACE
) ||
919 !ptracer_capable(current
, new->user_ns
))) {
920 /* downgrade; they get no more than they had, and maybe less */
921 if (!ns_capable(new->user_ns
, CAP_SETUID
) ||
922 (bprm
->unsafe
& LSM_UNSAFE_NO_NEW_PRIVS
)) {
923 new->euid
= new->uid
;
924 new->egid
= new->gid
;
926 new->cap_permitted
= cap_intersect(new->cap_permitted
,
930 new->suid
= new->fsuid
= new->euid
;
931 new->sgid
= new->fsgid
= new->egid
;
933 /* File caps or setid cancels ambient. */
934 if (has_fcap
|| is_setid
)
935 cap_clear(new->cap_ambient
);
938 * Now that we've computed pA', update pP' to give:
939 * pP' = (X & fP) | (pI & fI) | pA'
941 new->cap_permitted
= cap_combine(new->cap_permitted
, new->cap_ambient
);
944 * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
945 * this is the same as pE' = (fE ? pP' : 0) | pA'.
948 new->cap_effective
= new->cap_permitted
;
950 new->cap_effective
= new->cap_ambient
;
952 if (WARN_ON(!cap_ambient_invariant_ok(new)))
955 if (nonroot_raised_pE(new, old
, root_uid
, has_fcap
)) {
956 ret
= audit_log_bprm_fcaps(bprm
, new, old
);
961 new->securebits
&= ~issecure_mask(SECURE_KEEP_CAPS
);
963 if (WARN_ON(!cap_ambient_invariant_ok(new)))
966 /* Check for privilege-elevated exec. */
968 (!__is_real(root_uid
, new) &&
970 __cap_grew(permitted
, ambient
, new))))
971 bprm
->secureexec
= 1;
977 * cap_inode_setxattr - Determine whether an xattr may be altered
978 * @dentry: The inode/dentry being altered
979 * @name: The name of the xattr to be changed
980 * @value: The value that the xattr will be changed to
981 * @size: The size of value
982 * @flags: The replacement flag
984 * Determine whether an xattr may be altered or set on an inode, returning 0 if
985 * permission is granted, -ve if denied.
987 * This is used to make sure security xattrs don't get updated or set by those
988 * who aren't privileged to do so.
990 int cap_inode_setxattr(struct dentry
*dentry
, const char *name
,
991 const void *value
, size_t size
, int flags
)
993 struct user_namespace
*user_ns
= dentry
->d_sb
->s_user_ns
;
995 /* Ignore non-security xattrs */
996 if (strncmp(name
, XATTR_SECURITY_PREFIX
,
997 XATTR_SECURITY_PREFIX_LEN
) != 0)
1001 * For XATTR_NAME_CAPS the check will be done in
1002 * cap_convert_nscap(), called by setxattr()
1004 if (strcmp(name
, XATTR_NAME_CAPS
) == 0)
1007 if (!ns_capable(user_ns
, CAP_SYS_ADMIN
))
1013 * cap_inode_removexattr - Determine whether an xattr may be removed
1015 * @idmap: idmap of the mount the inode was found from
1016 * @dentry: The inode/dentry being altered
1017 * @name: The name of the xattr to be changed
1019 * Determine whether an xattr may be removed from an inode, returning 0 if
1020 * permission is granted, -ve if denied.
1022 * If the inode has been found through an idmapped mount the idmap of
1023 * the vfsmount must be passed through @idmap. This function will then
1024 * take care to map the inode according to @idmap before checking
1025 * permissions. On non-idmapped mounts or if permission checking is to be
1026 * performed on the raw inode simply pass @nop_mnt_idmap.
1028 * This is used to make sure security xattrs don't get removed by those who
1029 * aren't privileged to remove them.
1031 int cap_inode_removexattr(struct mnt_idmap
*idmap
,
1032 struct dentry
*dentry
, const char *name
)
1034 struct user_namespace
*user_ns
= dentry
->d_sb
->s_user_ns
;
1036 /* Ignore non-security xattrs */
1037 if (strncmp(name
, XATTR_SECURITY_PREFIX
,
1038 XATTR_SECURITY_PREFIX_LEN
) != 0)
1041 if (strcmp(name
, XATTR_NAME_CAPS
) == 0) {
1042 /* security.capability gets namespaced */
1043 struct inode
*inode
= d_backing_inode(dentry
);
1046 if (!capable_wrt_inode_uidgid(idmap
, inode
, CAP_SETFCAP
))
1051 if (!ns_capable(user_ns
, CAP_SYS_ADMIN
))
1057 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
1058 * a process after a call to setuid, setreuid, or setresuid.
1060 * 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
1061 * {r,e,s}uid != 0, the permitted and effective capabilities are
1064 * 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
1065 * capabilities of the process are cleared.
1067 * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
1068 * capabilities are set to the permitted capabilities.
1070 * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
1075 * cevans - New behaviour, Oct '99
1076 * A process may, via prctl(), elect to keep its capabilities when it
1077 * calls setuid() and switches away from uid==0. Both permitted and
1078 * effective sets will be retained.
1079 * Without this change, it was impossible for a daemon to drop only some
1080 * of its privilege. The call to setuid(!=0) would drop all privileges!
1081 * Keeping uid 0 is not an option because uid 0 owns too many vital
1083 * Thanks to Olaf Kirch and Peter Benie for spotting this.
1085 static inline void cap_emulate_setxuid(struct cred
*new, const struct cred
*old
)
1087 kuid_t root_uid
= make_kuid(old
->user_ns
, 0);
1089 if ((uid_eq(old
->uid
, root_uid
) ||
1090 uid_eq(old
->euid
, root_uid
) ||
1091 uid_eq(old
->suid
, root_uid
)) &&
1092 (!uid_eq(new->uid
, root_uid
) &&
1093 !uid_eq(new->euid
, root_uid
) &&
1094 !uid_eq(new->suid
, root_uid
))) {
1095 if (!issecure(SECURE_KEEP_CAPS
)) {
1096 cap_clear(new->cap_permitted
);
1097 cap_clear(new->cap_effective
);
1101 * Pre-ambient programs expect setresuid to nonroot followed
1102 * by exec to drop capabilities. We should make sure that
1103 * this remains the case.
1105 cap_clear(new->cap_ambient
);
1107 if (uid_eq(old
->euid
, root_uid
) && !uid_eq(new->euid
, root_uid
))
1108 cap_clear(new->cap_effective
);
1109 if (!uid_eq(old
->euid
, root_uid
) && uid_eq(new->euid
, root_uid
))
1110 new->cap_effective
= new->cap_permitted
;
1114 * cap_task_fix_setuid - Fix up the results of setuid() call
1115 * @new: The proposed credentials
1116 * @old: The current task's current credentials
1117 * @flags: Indications of what has changed
1119 * Fix up the results of setuid() call before the credential changes are
1122 * Return: 0 to grant the changes, -ve to deny them.
1124 int cap_task_fix_setuid(struct cred
*new, const struct cred
*old
, int flags
)
1130 /* juggle the capabilities to follow [RES]UID changes unless
1131 * otherwise suppressed */
1132 if (!issecure(SECURE_NO_SETUID_FIXUP
))
1133 cap_emulate_setxuid(new, old
);
1137 /* juggle the capabilities to follow FSUID changes, unless
1138 * otherwise suppressed
1140 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
1141 * if not, we might be a bit too harsh here.
1143 if (!issecure(SECURE_NO_SETUID_FIXUP
)) {
1144 kuid_t root_uid
= make_kuid(old
->user_ns
, 0);
1145 if (uid_eq(old
->fsuid
, root_uid
) && !uid_eq(new->fsuid
, root_uid
))
1146 new->cap_effective
=
1147 cap_drop_fs_set(new->cap_effective
);
1149 if (!uid_eq(old
->fsuid
, root_uid
) && uid_eq(new->fsuid
, root_uid
))
1150 new->cap_effective
=
1151 cap_raise_fs_set(new->cap_effective
,
1152 new->cap_permitted
);
1164 * Rationale: code calling task_setscheduler, task_setioprio, and
1165 * task_setnice, assumes that
1166 * . if capable(cap_sys_nice), then those actions should be allowed
1167 * . if not capable(cap_sys_nice), but acting on your own processes,
1168 * then those actions should be allowed
1169 * This is insufficient now since you can call code without suid, but
1170 * yet with increased caps.
1171 * So we check for increased caps on the target process.
1173 static int cap_safe_nice(struct task_struct
*p
)
1175 int is_subset
, ret
= 0;
1178 is_subset
= cap_issubset(__task_cred(p
)->cap_permitted
,
1179 current_cred()->cap_permitted
);
1180 if (!is_subset
&& !ns_capable(__task_cred(p
)->user_ns
, CAP_SYS_NICE
))
/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
	return cap_safe_nice(p);
}
/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task.  The decision does not depend on @ioprio.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
	return cap_safe_nice(p);
}
/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set
 *
 * Determine if the requested task priority change is permitted for the
 * specified task.  The decision does not depend on @nice.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
	return cap_safe_nice(p);
}
1232 * Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from
1233 * the current task's bounding set. Returns 0 on success, -ve on error.
1235 static int cap_prctl_drop(unsigned long cap
)
1239 if (!ns_capable(current_user_ns(), CAP_SETPCAP
))
1241 if (!cap_valid(cap
))
1244 new = prepare_creds();
1247 cap_lower(new->cap_bset
, cap
);
1248 return commit_creds(new);
1252 * cap_task_prctl - Implement process control functions for this security module
1253 * @option: The process control function requested
1254 * @arg2: The argument data for this function
1255 * @arg3: The argument data for this function
1256 * @arg4: The argument data for this function
1257 * @arg5: The argument data for this function
1259 * Allow process control functions (sys_prctl()) to alter capabilities; may
1260 * also deny access to other functions not otherwise implemented here.
1262 * Return: 0 or +ve on success, -ENOSYS if this function is not implemented
1263 * here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM
1264 * modules will consider performing the function.
1266 int cap_task_prctl(int option
, unsigned long arg2
, unsigned long arg3
,
1267 unsigned long arg4
, unsigned long arg5
)
1269 const struct cred
*old
= current_cred();
1273 case PR_CAPBSET_READ
:
1274 if (!cap_valid(arg2
))
1276 return !!cap_raised(old
->cap_bset
, arg2
);
1278 case PR_CAPBSET_DROP
:
1279 return cap_prctl_drop(arg2
);
1282 * The next four prctl's remain to assist with transitioning a
1283 * system from legacy UID=0 based privilege (when filesystem
1284 * capabilities are not in use) to a system using filesystem
1285 * capabilities only - as the POSIX.1e draft intended.
1289 * PR_SET_SECUREBITS =
1290 * issecure_mask(SECURE_KEEP_CAPS_LOCKED)
1291 * | issecure_mask(SECURE_NOROOT)
1292 * | issecure_mask(SECURE_NOROOT_LOCKED)
1293 * | issecure_mask(SECURE_NO_SETUID_FIXUP)
1294 * | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
1296 * will ensure that the current process and all of its
1297 * children will be locked into a pure
1298 * capability-based-privilege environment.
1300 case PR_SET_SECUREBITS
:
1301 if ((((old
->securebits
& SECURE_ALL_LOCKS
) >> 1)
1302 & (old
->securebits
^ arg2
)) /*[1]*/
1303 || ((old
->securebits
& SECURE_ALL_LOCKS
& ~arg2
)) /*[2]*/
1304 || (arg2
& ~(SECURE_ALL_LOCKS
| SECURE_ALL_BITS
)) /*[3]*/
1305 || (cap_capable(current_cred(),
1306 current_cred()->user_ns
,
1308 CAP_OPT_NONE
) != 0) /*[4]*/
1310 * [1] no changing of bits that are locked
1311 * [2] no unlocking of locks
1312 * [3] no setting of unsupported bits
1313 * [4] doing anything requires privilege (go read about
1314 * the "sendmail capabilities bug")
1317 /* cannot change a locked bit */
1320 new = prepare_creds();
1323 new->securebits
= arg2
;
1324 return commit_creds(new);
1326 case PR_GET_SECUREBITS
:
1327 return old
->securebits
;
1329 case PR_GET_KEEPCAPS
:
1330 return !!issecure(SECURE_KEEP_CAPS
);
1332 case PR_SET_KEEPCAPS
:
1333 if (arg2
> 1) /* Note, we rely on arg2 being unsigned here */
1335 if (issecure(SECURE_KEEP_CAPS_LOCKED
))
1338 new = prepare_creds();
1342 new->securebits
|= issecure_mask(SECURE_KEEP_CAPS
);
1344 new->securebits
&= ~issecure_mask(SECURE_KEEP_CAPS
);
1345 return commit_creds(new);
1347 case PR_CAP_AMBIENT
:
1348 if (arg2
== PR_CAP_AMBIENT_CLEAR_ALL
) {
1349 if (arg3
| arg4
| arg5
)
1352 new = prepare_creds();
1355 cap_clear(new->cap_ambient
);
1356 return commit_creds(new);
1359 if (((!cap_valid(arg3
)) | arg4
| arg5
))
1362 if (arg2
== PR_CAP_AMBIENT_IS_SET
) {
1363 return !!cap_raised(current_cred()->cap_ambient
, arg3
);
1364 } else if (arg2
!= PR_CAP_AMBIENT_RAISE
&&
1365 arg2
!= PR_CAP_AMBIENT_LOWER
) {
1368 if (arg2
== PR_CAP_AMBIENT_RAISE
&&
1369 (!cap_raised(current_cred()->cap_permitted
, arg3
) ||
1370 !cap_raised(current_cred()->cap_inheritable
,
1372 issecure(SECURE_NO_CAP_AMBIENT_RAISE
)))
1375 new = prepare_creds();
1378 if (arg2
== PR_CAP_AMBIENT_RAISE
)
1379 cap_raise(new->cap_ambient
, arg3
);
1381 cap_lower(new->cap_ambient
, arg3
);
1382 return commit_creds(new);
1386 /* No functionality available - continue with default */
1392 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
1393 * @mm: The VM space in which the new mapping is to be made
1394 * @pages: The size of the mapping
1396 * Determine whether the allocation of a new virtual mapping by the current
1397 * task is permitted.
1399 * Return: 0 if permission granted, negative error code if not.
1401 int cap_vm_enough_memory(struct mm_struct
*mm
, long pages
)
1403 return cap_capable(current_cred(), &init_user_ns
, CAP_SYS_ADMIN
,
1408 * cap_mmap_addr - check if able to map given addr
1409 * @addr: address attempting to be mapped
1411 * If the process is attempting to map memory below dac_mmap_min_addr they need
1412 * CAP_SYS_RAWIO. The other parameters to this function are unused by the
1413 * capability security module.
1415 * Return: 0 if this mapping should be allowed or -EPERM if not.
1417 int cap_mmap_addr(unsigned long addr
)
1421 if (addr
< dac_mmap_min_addr
) {
1422 ret
= cap_capable(current_cred(), &init_user_ns
, CAP_SYS_RAWIO
,
1424 /* set PF_SUPERPRIV if it turns out we allow the low mmap */
1426 current
->flags
|= PF_SUPERPRIV
;
/*
 * cap_mmap_file - mmap_file hook of the capability module.
 *
 * None of the arguments are examined; the hook always returns 0.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
		  unsigned long prot, unsigned long flags)
{
	return 0;
}
#ifdef CONFIG_SECURITY

/* Identity under which the capability hooks are registered. */
static const struct lsm_id capability_lsmid = {
	.name = "capability",
	.id = LSM_ID_CAPABILITY,
};

/* Hook table: maps each LSM hook to its cap_* implementation in this file. */
static struct security_hook_list capability_hooks[] __ro_after_init = {
	LSM_HOOK_INIT(capable, cap_capable),
	LSM_HOOK_INIT(settime, cap_settime),
	LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
	LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
	LSM_HOOK_INIT(capget, cap_capget),
	LSM_HOOK_INIT(capset, cap_capset),
	LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
	LSM_HOOK_INIT(task_prctl, cap_task_prctl),
	LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
	LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
	LSM_HOOK_INIT(task_setnice, cap_task_setnice),
	LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};

/* Register the capability hook table with the LSM framework. */
static int __init capability_init(void)
{
	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
			   &capability_lsmid);
	return 0;
}

/* LSM_ORDER_FIRST: the capability module is ordered ahead of other LSMs. */
DEFINE_LSM(capability) = {
	.name = "capability",
	.order = LSM_ORDER_FIRST,
	.init = capability_init,
};

#endif /* CONFIG_SECURITY */