1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "sandbox/linux/services/credentials.h"
11 #include <sys/syscall.h>
12 #include <sys/types.h>
16 #include "base/bind.h"
17 #include "base/files/file_path.h"
18 #include "base/files/file_util.h"
19 #include "base/logging.h"
20 #include "base/posix/eintr_wrapper.h"
21 #include "base/process/launch.h"
22 #include "base/template_util.h"
23 #include "base/third_party/valgrind/valgrind.h"
24 #include "build/build_config.h"
25 #include "sandbox/linux/services/namespace_utils.h"
26 #include "sandbox/linux/services/proc_util.h"
27 #include "sandbox/linux/services/syscall_wrappers.h"
28 #include "sandbox/linux/services/thread_helpers.h"
29 #include "sandbox/linux/system_headers/capability.h"
30 #include "sandbox/linux/system_headers/linux_signal.h"
36 bool IsRunningOnValgrind() { return RUNNING_ON_VALGRIND
; }
38 // Checks that the set of RES-uids and the set of RES-gids have
39 // one element each and return that element in |resuid| and |resgid|
40 // respectively. It's ok to pass NULL as one or both of the ids.
41 bool GetRESIds(uid_t
* resuid
, gid_t
* resgid
) {
42 uid_t ruid
, euid
, suid
;
43 gid_t rgid
, egid
, sgid
;
44 PCHECK(sys_getresuid(&ruid
, &euid
, &suid
) == 0);
45 PCHECK(sys_getresgid(&rgid
, &egid
, &sgid
) == 0);
46 const bool uids_are_equal
= (ruid
== euid
) && (ruid
== suid
);
47 const bool gids_are_equal
= (rgid
== egid
) && (rgid
== sgid
);
48 if (!uids_are_equal
|| !gids_are_equal
) return false;
49 if (resuid
) *resuid
= euid
;
50 if (resgid
) *resgid
= egid
;
// Exit status that helper child processes use to report success; it is
// compared against WEXITSTATUS() once the child has been reaped.
const int kExitSuccess = 0;
56 #if defined(__clang__)
57 // Disable sanitizers that rely on TLS and may write to non-stack memory.
58 __attribute__((no_sanitize_address
))
59 __attribute__((no_sanitize_thread
))
60 __attribute__((no_sanitize_memory
))
62 int ChrootToSelfFdinfo(void*) {
63 // This function can be run from a vforked child, so it should not write to
64 // any memory other than the stack or errno. Reads from TLS may be different
65 // from in the parent process.
66 RAW_CHECK(sys_chroot("/proc/self/fdinfo/") == 0);
68 // CWD is essentially an implicit file descriptor, so be careful to not
70 RAW_CHECK(chdir("/") == 0);
74 // chroot() to an empty dir that is "safe". To be safe, it must not contain
75 // any subdirectory (chroot-ing there would allow a chroot escape) and it must
76 // be impossible to create an empty directory there.
77 // We achieve this by doing the following:
78 // 1. We create a new process sharing file system information.
79 // 2. In the child, we chroot to /proc/self/fdinfo/
80 // This is already "safe", since fdinfo/ does not contain another directory and
81 // one cannot create another directory there.
82 // 3. The process dies
83 // After (3) happens, the directory is not available anymore in /proc.
//
// Returns true when the helper child exited normally with kExitSuccess.
//
// NOTE(review): this listing looks incomplete. |pid|, |tls| and |status| are
// used below without a visible declaration, the clone() call has no visible
// closing arguments, and the conditional sections have no visible
// #else/#endif lines. Confirm against the complete file before editing.
84 bool ChrootToSafeEmptyDir() {
85 // We need to chroot to a fdinfo that is unique to a process and have that
// process die (step 3 of the function comment).
87 // 1. We don't want to simply fork() because duplicating the page tables is
88 // slow with a big address space.
89 // 2. We do not use a regular thread (that would unshare CLONE_FILES) because
90 // when we are in a PID namespace, we cannot easily get a handle to the
91 // /proc/tid directory for the thread (since /proc may not be aware of the
92 // PID namespace). With a process, we can just use /proc/self.
// Backing storage for the child's stack, PTHREAD_STACK_MIN bytes long.
94 char stack_buf
[PTHREAD_STACK_MIN
];
95 #if defined(ARCH_CPU_X86_FAMILY) || defined(ARCH_CPU_ARM_FAMILY) || \
96 defined(ARCH_CPU_MIPS64_FAMILY) || defined(ARCH_CPU_MIPS_FAMILY)
97 // The stack grows downward.
// Hence the stack pointer handed to clone() is the end of |stack_buf|.
98 void* stack
= stack_buf
+ sizeof(stack_buf
);
100 #error "Unsupported architecture"
// CLONE_FS corresponds to step (1) of the function comment: the child shares
// file system information with this process.
103 int clone_flags
= CLONE_FS
| LINUX_SIGCHLD
;
105 #if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_ARM_FAMILY)
106 // Use CLONE_VM | CLONE_VFORK as an optimization to avoid copying page tables.
107 // Since clone writes to the new child's TLS before returning, we must set a
108 // new TLS to avoid corrupting the current process's TLS. On ARCH_CPU_X86,
109 // glibc performs syscalls by calling a function pointer in TLS, so we do not
110 // attempt this optimization.
111 clone_flags
|= CLONE_VM
| CLONE_VFORK
| CLONE_SETTLS
;
// Zero-filled buffer; presumably what |tls| is pointed at for CLONE_SETTLS.
// The assignment is not visible in this listing -- TODO confirm.
113 char tls_buf
[PTHREAD_STACK_MIN
] = {0};
// Run ChrootToSelfFdinfo() in the new child, using |stack| as its stack.
117 pid
= clone(ChrootToSelfFdinfo
, stack
, clone_flags
, nullptr, nullptr, tls
,
// Wait for that child; a waitpid() result other than |pid| is fatal (PCHECK).
122 PCHECK(HANDLE_EINTR(waitpid(pid
, &status
, 0)) == pid
);
// Success means the child exited normally and reported kExitSuccess.
124 return WIFEXITED(status
) && WEXITSTATUS(status
) == kExitSuccess
;
127 // CHECK() that an attempt to move to a new user namespace raised an expected
129 void CheckCloneNewUserErrno(int error
) {
130 // EPERM can happen if already in a chroot. EUSERS if too many nested
131 // namespaces are used. EINVAL for kernels that don't support the feature.
132 // Valgrind will ENOSYS unshare().
133 PCHECK(error
== EPERM
|| error
== EUSERS
|| error
== EINVAL
||
137 // Converts a Capability to the corresponding Linux CAP_XXX value.
138 int CapabilityToKernelValue(Credentials::Capability cap
) {
140 case Credentials::Capability::SYS_CHROOT
:
141 return CAP_SYS_CHROOT
;
142 case Credentials::Capability::SYS_ADMIN
:
143 return CAP_SYS_ADMIN
;
146 LOG(FATAL
) << "Invalid Capability: " << static_cast<int>(cap
);
153 bool Credentials::DropAllCapabilities(int proc_fd
) {
154 if (!SetCapabilities(proc_fd
, std::vector
<Capability
>())) {
158 CHECK(!HasAnyCapability());
163 bool Credentials::DropAllCapabilities() {
164 base::ScopedFD
proc_fd(ProcUtil::OpenProc());
165 return Credentials::DropAllCapabilities(proc_fd
.get());
169 bool Credentials::DropAllCapabilitiesOnCurrentThread() {
170 return SetCapabilitiesOnCurrentThread(std::vector
<Capability
>());
174 bool Credentials::SetCapabilitiesOnCurrentThread(
175 const std::vector
<Capability
>& caps
) {
176 struct cap_hdr hdr
= {};
177 hdr
.version
= _LINUX_CAPABILITY_VERSION_3
;
178 struct cap_data data
[_LINUX_CAPABILITY_U32S_3
] = {{}};
180 // Initially, cap has no capability flags set. Enable the effective and
181 // permitted flags only for the requested capabilities.
182 for (const Capability cap
: caps
) {
183 const int cap_num
= CapabilityToKernelValue(cap
);
184 const size_t index
= CAP_TO_INDEX(cap_num
);
185 const uint32_t mask
= CAP_TO_MASK(cap_num
);
186 data
[index
].effective
|= mask
;
187 data
[index
].permitted
|= mask
;
190 return sys_capset(&hdr
, data
) == 0;
194 bool Credentials::SetCapabilities(int proc_fd
,
195 const std::vector
<Capability
>& caps
) {
196 DCHECK_LE(0, proc_fd
);
198 #if !defined(THREAD_SANITIZER)
199 // With TSAN, accept to break the security model as it is a testing
201 CHECK(ThreadHelpers::IsSingleThreaded(proc_fd
));
204 return SetCapabilitiesOnCurrentThread(caps
);
207 bool Credentials::HasAnyCapability() {
208 struct cap_hdr hdr
= {};
209 hdr
.version
= _LINUX_CAPABILITY_VERSION_3
;
210 struct cap_data data
[_LINUX_CAPABILITY_U32S_3
] = {{}};
212 PCHECK(sys_capget(&hdr
, data
) == 0);
214 for (size_t i
= 0; i
< arraysize(data
); ++i
) {
215 if (data
[i
].effective
|| data
[i
].permitted
|| data
[i
].inheritable
) {
223 bool Credentials::HasCapability(Capability cap
) {
224 struct cap_hdr hdr
= {};
225 hdr
.version
= _LINUX_CAPABILITY_VERSION_3
;
226 struct cap_data data
[_LINUX_CAPABILITY_U32S_3
] = {{}};
228 PCHECK(sys_capget(&hdr
, data
) == 0);
230 const int cap_num
= CapabilityToKernelValue(cap
);
231 const size_t index
= CAP_TO_INDEX(cap_num
);
232 const uint32_t mask
= CAP_TO_MASK(cap_num
);
234 return (data
[index
].effective
| data
[index
].permitted
|
235 data
[index
].inheritable
) &
// Probes whether a process can be created in a new user namespace by cloning
// a child with CLONE_NEWUSER and then reaping it.
//
// NOTE(review): this listing looks incomplete. The bodies of the Valgrind and
// THREAD_SANITIZER branches, the handling of the clone result, the child path,
// the declaration of |status| and the return statements are not visible.
// Confirm against the complete file before editing.
240 bool Credentials::CanCreateProcessInNewUserNS() {
241 // Valgrind will let clone(2) pass-through, but doesn't support unshare(),
242 // so always consider UserNS unsupported there.
243 if (IsRunningOnValgrind()) {
247 #if defined(THREAD_SANITIZER)
248 // With TSAN, processes will always have threads running and can never
249 // enter a new user namespace with MoveToNewUserNS().
253 // This is roughly a fork().
254 const pid_t pid
= sys_clone(CLONE_NEWUSER
| SIGCHLD
, 0, 0, 0, 0);
// Validates errno against the set accepted by CheckCloneNewUserErrno();
// presumably reached only when the clone above failed -- TODO confirm.
257 CheckCloneNewUserErrno(errno
);
261 // The parent process could have had threads. In the child, these threads
262 // have disappeared. Make sure to not do anything in the child, as this is a
263 // fragile execution environment.
268 // Always reap the child.
// The child must have exited normally with kExitSuccess; anything else is
// fatal (CHECK).
270 PCHECK(HANDLE_EINTR(waitpid(pid
, &status
, 0)) == pid
);
271 CHECK(WIFEXITED(status
));
272 CHECK_EQ(kExitSuccess
, WEXITSTATUS(status
));
274 // clone(2) succeeded, we can use CLONE_NEWUSER.
// Moves the calling process into a new user namespace. The visible steps are:
// read the current ids with GetRESIds(), call sys_unshare(CLONE_NEWUSER), deny
// setgroups when the kernel supports that, then write the gid map followed by
// the uid map.
//
// NOTE(review): this listing looks incomplete. The declarations of |uid| and
// |gid|, the condition guarding the unshare-failure path, the closing braces
// and every return statement are not visible. Confirm against the complete
// file before editing.
278 bool Credentials::MoveToNewUserNS() {
281 if (!GetRESIds(&uid
, &gid
)) {
282 // If all the uids (or gids) are not equal to each other, the security
283 // model will most likely confuse the caller, abort.
284 DVLOG(1) << "uids or gids differ!";
287 int ret
= sys_unshare(CLONE_NEWUSER
);
// errno is copied before anything else runs, presumably so that the logging
// below cannot overwrite it -- TODO confirm.
289 const int unshare_errno
= errno
;
290 VLOG(1) << "Looks like unprivileged CLONE_NEWUSER may not be available "
291 << "on this kernel.";
292 CheckCloneNewUserErrno(unshare_errno
);
// When the kernel supports denying setgroups, failing to do so is fatal.
296 if (NamespaceUtils::KernelSupportsDenySetgroups()) {
297 PCHECK(NamespaceUtils::DenySetgroups());
300 // The current {r,e,s}{u,g}id is now an overflow id (c.f.
301 // /proc/sys/kernel/overflowuid). Setup the uid and gid maps.
302 DCHECK(GetRESIds(NULL
, NULL
));
303 const char kGidMapFile
[] = "/proc/self/gid_map";
304 const char kUidMapFile
[] = "/proc/self/uid_map";
// The gid map is written before the uid map; a failed write is fatal.
305 PCHECK(NamespaceUtils::WriteToIdMapFile(kGidMapFile
, gid
));
306 PCHECK(NamespaceUtils::WriteToIdMapFile(kUidMapFile
, uid
));
// Debug-only re-check that the uids and the gids are still each a single
// value once both maps are written.
307 DCHECK(GetRESIds(NULL
, NULL
));
311 bool Credentials::DropFileSystemAccess(int proc_fd
) {
312 CHECK_LE(0, proc_fd
);
314 CHECK(ChrootToSafeEmptyDir());
315 CHECK(!base::DirectoryExists(base::FilePath("/proc")));
316 CHECK(!ProcUtil::HasOpenDirectory(proc_fd
));
317 // We never let this function fail.
// NOTE(review): only part of this function is visible in this listing. The
// call that creates the child, the branch separating parent from child and
// the return statements are not shown. The visible part drops all
// capabilities on the current thread and treats a failure as fatal (PCHECK);
// per the comment below it runs right after forking. Confirm against the
// complete file before editing.
321 pid_t
Credentials::ForkAndDropCapabilitiesInChild() {
327 // Since we just forked, we are single threaded.
328 PCHECK(DropAllCapabilitiesOnCurrentThread());
332 } // namespace sandbox.