1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
7 // Some headers on Android are missing cdefs: crbug.com/172337.
8 // (We can't use OS_ANDROID here since build_config.h is not included).
10 #include <sys/cdefs.h>
16 #include <sys/prctl.h>
18 #include <sys/syscall.h>
19 #include <sys/types.h>
23 #include "base/compiler_specific.h"
24 #include "base/logging.h"
25 #include "base/macros.h"
26 #include "base/memory/scoped_ptr.h"
27 #include "base/posix/eintr_wrapper.h"
28 #include "sandbox/linux/seccomp-bpf/codegen.h"
29 #include "sandbox/linux/seccomp-bpf/sandbox_bpf_policy.h"
30 #include "sandbox/linux/seccomp-bpf/syscall.h"
31 #include "sandbox/linux/seccomp-bpf/syscall_iterator.h"
32 #include "sandbox/linux/seccomp-bpf/verifier.h"
33 #include "sandbox/linux/services/linux_syscalls.h"
// Exit status that the probe child processes pass to exit_group() on
// success. The parent compares the child's exit status against this value.
const int kExpectedExitCode = 100;
// Returns the number of bits that are set in "x".
int popcount(uint32_t x) {
  return __builtin_popcount(x);
}
46 void WriteFailedStderrSetupMessage(int out_fd
) {
47 const char* error_string
= strerror(errno
);
48 static const char msg
[] =
49 "You have reproduced a puzzling issue.\n"
50 "Please, report to crbug.com/152530!\n"
51 "Failed to set up stderr: ";
52 if (HANDLE_EINTR(write(out_fd
, msg
, sizeof(msg
) - 1)) > 0 && error_string
&&
53 HANDLE_EINTR(write(out_fd
, error_string
, strlen(error_string
))) > 0 &&
54 HANDLE_EINTR(write(out_fd
, "\n", 1))) {
57 #endif // !defined(NDEBUG)
59 // We define a really simple sandbox policy. It is just good enough for us
60 // to tell that the sandbox has actually been activated.
61 class ProbePolicy
: public SandboxBPFPolicy
{
64 virtual ErrorCode
EvaluateSyscall(SandboxBPF
*, int sysnum
) const OVERRIDE
{
67 // Return EPERM so that we can check that the filter actually ran.
68 return ErrorCode(EPERM
);
70 // Allow exit() with a non-default return code.
71 return ErrorCode(ErrorCode::ERR_ALLOWED
);
73 // Make everything else fail in an easily recognizable way.
74 return ErrorCode(EINVAL
);
79 DISALLOW_COPY_AND_ASSIGN(ProbePolicy
);
82 void ProbeProcess(void) {
83 if (syscall(__NR_getpid
) < 0 && errno
== EPERM
) {
84 syscall(__NR_exit_group
, static_cast<intptr_t>(kExpectedExitCode
));
88 class AllowAllPolicy
: public SandboxBPFPolicy
{
91 virtual ErrorCode
EvaluateSyscall(SandboxBPF
*, int sysnum
) const OVERRIDE
{
92 DCHECK(SandboxBPF::IsValidSyscallNumber(sysnum
));
93 return ErrorCode(ErrorCode::ERR_ALLOWED
);
97 DISALLOW_COPY_AND_ASSIGN(AllowAllPolicy
);
100 void TryVsyscallProcess(void) {
102 // time() is implemented as a vsyscall. With an older glibc, with
103 // vsyscall=emulate and some versions of the seccomp BPF patch
104 // we may get SIGKILL-ed. Detect this!
105 if (time(¤t_time
) != static_cast<time_t>(-1)) {
106 syscall(__NR_exit_group
, static_cast<intptr_t>(kExpectedExitCode
));
110 bool IsSingleThreaded(int proc_fd
) {
112 // Cannot determine whether program is single-threaded. Hope for
119 if ((task
= openat(proc_fd
, "self/task", O_RDONLY
| O_DIRECTORY
)) < 0 ||
120 fstat(task
, &sb
) != 0 || sb
.st_nlink
!= 3 || IGNORE_EINTR(close(task
))) {
122 if (IGNORE_EINTR(close(task
))) {
130 bool IsDenied(const ErrorCode
& code
) {
131 return (code
.err() & SECCOMP_RET_ACTION
) == SECCOMP_RET_TRAP
||
132 (code
.err() >= (SECCOMP_RET_ERRNO
+ ErrorCode::ERR_MIN_ERRNO
) &&
133 code
.err() <= (SECCOMP_RET_ERRNO
+ ErrorCode::ERR_MAX_ERRNO
));
136 // Function that can be passed as a callback function to CodeGen::Traverse().
137 // Checks whether the "insn" returns an UnsafeTrap() ErrorCode. If so, it
138 // sets the "bool" variable pointed to by "aux".
139 void CheckForUnsafeErrorCodes(Instruction
* insn
, void* aux
) {
140 bool* is_unsafe
= static_cast<bool*>(aux
);
142 if (BPF_CLASS(insn
->code
) == BPF_RET
&& insn
->k
> SECCOMP_RET_TRAP
&&
143 insn
->k
- SECCOMP_RET_TRAP
<= SECCOMP_RET_DATA
) {
144 const ErrorCode
& err
=
145 Trap::ErrorCodeFromTrapId(insn
->k
& SECCOMP_RET_DATA
);
146 if (err
.error_type() != ErrorCode::ET_INVALID
&& !err
.safe()) {
153 // A Trap() handler that returns an "errno" value. The value is encoded
154 // in the "aux" parameter.
155 intptr_t ReturnErrno(const struct arch_seccomp_data
&, void* aux
) {
156 // TrapFnc functions report error by following the native kernel convention
157 // of returning an exit code in the range of -1..-4096. They do not try to
158 // set errno themselves. The glibc wrapper that triggered the SIGSYS will
159 // ultimately do so for us.
160 int err
= reinterpret_cast<intptr_t>(aux
) & SECCOMP_RET_DATA
;
164 // Function that can be passed as a callback function to CodeGen::Traverse().
165 // Checks whether the "insn" returns an errno value from a BPF filter. If so,
166 // it rewrites the instruction to instead call a Trap() handler that does
167 // the same thing. "aux" is ignored.
168 void RedirectToUserspace(Instruction
* insn
, void* aux
) {
169 // When inside an UnsafeTrap() callback, we want to allow all system calls.
170 // This means, we must conditionally disable the sandbox -- and that's not
171 // something that kernel-side BPF filters can do, as they cannot inspect
172 // any state other than the syscall arguments.
173 // But if we redirect all error handlers to user-space, then we can easily
174 // make this decision.
175 // The performance penalty for this extra round-trip to user-space is not
176 // actually that bad, as we only ever pay it for denied system calls; and a
177 // typical program has very few of these.
178 SandboxBPF
* sandbox
= static_cast<SandboxBPF
*>(aux
);
179 if (BPF_CLASS(insn
->code
) == BPF_RET
&&
180 (insn
->k
& SECCOMP_RET_ACTION
) == SECCOMP_RET_ERRNO
) {
181 insn
->k
= sandbox
->Trap(ReturnErrno
,
182 reinterpret_cast<void*>(insn
->k
& SECCOMP_RET_DATA
)).err();
186 // This wraps an existing policy and changes its behavior to match the changes
187 // made by RedirectToUserspace(). This is part of the framework that allows BPF
188 // evaluation in userland.
189 // TODO(markus): document the code inside better.
190 class RedirectToUserSpacePolicyWrapper
: public SandboxBPFPolicy
{
192 explicit RedirectToUserSpacePolicyWrapper(
193 const SandboxBPFPolicy
* wrapped_policy
)
194 : wrapped_policy_(wrapped_policy
) {
195 DCHECK(wrapped_policy_
);
198 virtual ErrorCode
EvaluateSyscall(SandboxBPF
* sandbox_compiler
,
199 int system_call_number
) const OVERRIDE
{
201 wrapped_policy_
->EvaluateSyscall(sandbox_compiler
, system_call_number
);
202 if ((err
.err() & SECCOMP_RET_ACTION
) == SECCOMP_RET_ERRNO
) {
203 return ReturnErrnoViaTrap(sandbox_compiler
, err
.err() & SECCOMP_RET_DATA
);
208 virtual ErrorCode
InvalidSyscall(
209 SandboxBPF
* sandbox_compiler
) const OVERRIDE
{
210 return ReturnErrnoViaTrap(sandbox_compiler
, ENOSYS
);
214 ErrorCode
ReturnErrnoViaTrap(SandboxBPF
* sandbox_compiler
, int err
) const {
215 return sandbox_compiler
->Trap(ReturnErrno
, reinterpret_cast<void*>(err
));
218 const SandboxBPFPolicy
* wrapped_policy_
;
219 DISALLOW_COPY_AND_ASSIGN(RedirectToUserSpacePolicyWrapper
);
222 intptr_t BPFFailure(const struct arch_seccomp_data
&, void* aux
) {
223 SANDBOX_DIE(static_cast<char*>(aux
));
228 SandboxBPF::SandboxBPF()
232 sandbox_has_started_(false) {}
234 SandboxBPF::~SandboxBPF() {
235 // It is generally unsafe to call any memory allocator operations or to even
236 // call arbitrary destructors after having installed a new policy. We just
237 // have no way to tell whether this policy would allow the system calls that
238 // the constructors can trigger.
239 // So, we normally destroy all of our complex state prior to starting the
240 // sandbox. But this won't happen, if the Sandbox object was created and
241 // never actually used to set up a sandbox. So, just in case, we are
242 // destroying any remaining state.
243 // The "if ()" statements are technically superfluous. But let's be explicit
244 // that we really don't want to run any code, when we already destroyed
245 // objects before setting up the sandbox.
251 bool SandboxBPF::IsValidSyscallNumber(int sysnum
) {
252 return SyscallIterator::IsValid(sysnum
);
255 bool SandboxBPF::RunFunctionInPolicy(void (*code_in_sandbox
)(),
256 scoped_ptr
<SandboxBPFPolicy
> policy
) {
257 // Block all signals before forking a child process. This prevents an
258 // attacker from manipulating our test by sending us an unexpected signal.
259 sigset_t old_mask
, new_mask
;
260 if (sigfillset(&new_mask
) || sigprocmask(SIG_BLOCK
, &new_mask
, &old_mask
)) {
261 SANDBOX_DIE("sigprocmask() failed");
264 if (pipe2(fds
, O_NONBLOCK
| O_CLOEXEC
)) {
265 SANDBOX_DIE("pipe() failed");
268 if (fds
[0] <= 2 || fds
[1] <= 2) {
269 SANDBOX_DIE("Process started without standard file descriptors");
272 // This code is using fork() and should only ever run single-threaded.
273 // Most of the code below is "async-signal-safe" and only minor changes
274 // would be needed to support threads.
275 DCHECK(IsSingleThreaded(proc_fd_
));
278 // Die if we cannot fork(). We would probably fail a little later
279 // anyway, as the machine is likely very close to running out of
281 // But what we don't want to do is return "false", as a crafty
282 // attacker might cause fork() to fail at will and could trick us
283 // into running without a sandbox.
284 sigprocmask(SIG_SETMASK
, &old_mask
, NULL
); // OK, if it fails
285 SANDBOX_DIE("fork() failed unexpectedly");
288 // In the child process
290 // Test a very simple sandbox policy to verify that we can
291 // successfully turn on sandboxing.
292 Die::EnableSimpleExit();
295 if (IGNORE_EINTR(close(fds
[0]))) {
296 // This call to close() has been failing in strange ways. See
297 // crbug.com/152530. So we only fail in debug mode now.
299 WriteFailedStderrSetupMessage(fds
[1]);
303 if (HANDLE_EINTR(dup2(fds
[1], 2)) != 2) {
304 // Stderr could very well be a file descriptor to .xsession-errors, or
305 // another file, which could be backed by a file system that could cause
306 // dup2 to fail while trying to close stderr. It's important that we do
307 // not fail on trying to close stderr.
308 // If dup2 fails here, we will continue normally, this means that our
309 // parent won't cause a fatal failure if something writes to stderr in
312 // In DEBUG builds, we still want to get a report.
313 WriteFailedStderrSetupMessage(fds
[1]);
317 if (IGNORE_EINTR(close(fds
[1]))) {
318 // This call to close() has been failing in strange ways. See
319 // crbug.com/152530. So we only fail in debug mode now.
321 WriteFailedStderrSetupMessage(fds
[1]);
326 SetSandboxPolicy(policy
.release());
327 if (!StartSandbox(PROCESS_SINGLE_THREADED
)) {
331 // Run our code in the sandbox.
334 // code_in_sandbox() is not supposed to return here.
338 // In the parent process.
339 if (IGNORE_EINTR(close(fds
[1]))) {
340 SANDBOX_DIE("close() failed");
342 if (sigprocmask(SIG_SETMASK
, &old_mask
, NULL
)) {
343 SANDBOX_DIE("sigprocmask() failed");
346 if (HANDLE_EINTR(waitpid(pid
, &status
, 0)) != pid
) {
347 SANDBOX_DIE("waitpid() failed unexpectedly");
349 bool rc
= WIFEXITED(status
) && WEXITSTATUS(status
) == kExpectedExitCode
;
351 // If we fail to support sandboxing, there might be an additional
352 // error message. If so, this was an entirely unexpected and fatal
353 // failure. We should report the failure and somebody must fix
354 // things. This is probably a security-critical bug in the sandboxing
358 ssize_t len
= HANDLE_EINTR(read(fds
[0], buf
, sizeof(buf
) - 1));
360 while (len
> 1 && buf
[len
- 1] == '\n') {
367 if (IGNORE_EINTR(close(fds
[0]))) {
368 SANDBOX_DIE("close() failed");
374 bool SandboxBPF::KernelSupportSeccompBPF() {
375 return RunFunctionInPolicy(ProbeProcess
,
376 scoped_ptr
<SandboxBPFPolicy
>(new ProbePolicy())) &&
379 scoped_ptr
<SandboxBPFPolicy
>(new AllowAllPolicy()));
383 SandboxBPF::SandboxStatus
SandboxBPF::SupportsSeccompSandbox(int proc_fd
) {
384 // If the sandbox is currently active, we clearly must have support for
386 if (status_
== STATUS_ENABLED
) {
390 // Even if the sandbox was previously available, something might have
391 // changed in our run-time environment. Check one more time.
392 if (status_
== STATUS_AVAILABLE
) {
393 if (!IsSingleThreaded(proc_fd
)) {
394 status_
= STATUS_UNAVAILABLE
;
399 if (status_
== STATUS_UNAVAILABLE
&& IsSingleThreaded(proc_fd
)) {
400 // All state transitions resulting in STATUS_UNAVAILABLE are immediately
401 // preceded by STATUS_AVAILABLE. Furthermore, these transitions all
402 // happen, if and only if they are triggered by the process being multi-
404 // In other words, if a single-threaded process is currently in the
405 // STATUS_UNAVAILABLE state, it is safe to assume that sandboxing is
406 // actually available.
407 status_
= STATUS_AVAILABLE
;
411 // If we have not previously checked for availability of the sandbox or if
412 // we otherwise don't believe to have a good cached value, we have to
413 // perform a thorough check now.
414 if (status_
== STATUS_UNKNOWN
) {
415 // We create our own private copy of a "Sandbox" object. This ensures that
416 // the object does not have any policies configured, that might interfere
417 // with the tests done by "KernelSupportSeccompBPF()".
420 // By setting "quiet_ = true" we suppress messages for expected and benign
421 // failures (e.g. if the current kernel lacks support for BPF filters).
422 sandbox
.quiet_
= true;
423 sandbox
.set_proc_fd(proc_fd
);
424 status_
= sandbox
.KernelSupportSeccompBPF() ? STATUS_AVAILABLE
425 : STATUS_UNSUPPORTED
;
427 // As we are performing our tests from a child process, the run-time
428 // environment that is visible to the sandbox is always guaranteed to be
429 // single-threaded. Let's check here whether the caller is single-
430 // threaded. Otherwise, we mark the sandbox as temporarily unavailable.
431 if (status_
== STATUS_AVAILABLE
&& !IsSingleThreaded(proc_fd
)) {
432 status_
= STATUS_UNAVAILABLE
;
439 SandboxBPF::SandboxStatus
440 SandboxBPF::SupportsSeccompThreadFilterSynchronization() {
441 // Applying NO_NEW_PRIVS, a BPF filter, and synchronizing the filter across
442 // the thread group are all handled atomically by this syscall.
443 int rv
= syscall(__NR_seccomp
);
445 // The system call should have failed with EINVAL.
448 return STATUS_UNKNOWN
;
451 if (errno
== EINVAL
|| errno
== EFAULT
)
452 return STATUS_AVAILABLE
;
454 // errno is probably ENOSYS, indicating the system call is not available.
455 DCHECK_EQ(errno
, ENOSYS
);
456 return STATUS_UNSUPPORTED
;
459 void SandboxBPF::set_proc_fd(int proc_fd
) { proc_fd_
= proc_fd
; }
461 bool SandboxBPF::StartSandbox(SandboxThreadState thread_state
) {
462 CHECK(thread_state
== PROCESS_SINGLE_THREADED
||
463 thread_state
== PROCESS_MULTI_THREADED
);
465 if (status_
== STATUS_UNSUPPORTED
|| status_
== STATUS_UNAVAILABLE
) {
467 "Trying to start sandbox, even though it is known to be "
470 } else if (sandbox_has_started_
|| !conds_
) {
472 "Cannot repeatedly start sandbox. Create a separate Sandbox "
477 proc_fd_
= open("/proc", O_RDONLY
| O_DIRECTORY
);
480 // For now, continue in degraded mode, if we can't access /proc.
481 // In the future, we might want to tighten this requirement.
484 bool supports_tsync
=
485 SupportsSeccompThreadFilterSynchronization() == STATUS_AVAILABLE
;
487 if (thread_state
== PROCESS_SINGLE_THREADED
) {
488 if (!IsSingleThreaded(proc_fd_
)) {
489 SANDBOX_DIE("Cannot start sandbox; process is already multi-threaded");
492 } else if (thread_state
== PROCESS_MULTI_THREADED
) {
493 if (IsSingleThreaded(proc_fd_
)) {
494 SANDBOX_DIE("Cannot start sandbox; "
495 "process may be single-threaded when reported as not");
498 if (!supports_tsync
) {
499 SANDBOX_DIE("Cannot start sandbox; kernel does not support synchronizing "
500 "filters for a threadgroup");
505 // We no longer need access to any files in /proc. We want to do this
506 // before installing the filters, just in case that our policy denies
509 if (IGNORE_EINTR(close(proc_fd_
))) {
510 SANDBOX_DIE("Failed to close file descriptor for /proc");
516 // Install the filters.
517 InstallFilter(supports_tsync
|| thread_state
== PROCESS_MULTI_THREADED
);
519 // We are now inside the sandbox.
520 status_
= STATUS_ENABLED
;
525 void SandboxBPF::PolicySanityChecks(SandboxBPFPolicy
* policy
) {
526 if (!IsDenied(policy
->InvalidSyscall(this))) {
527 SANDBOX_DIE("Policies should deny invalid system calls.");
532 // Don't take a scoped_ptr here, polymorphism makes its use awkward.
533 void SandboxBPF::SetSandboxPolicy(SandboxBPFPolicy
* policy
) {
535 if (sandbox_has_started_
|| !conds_
) {
536 SANDBOX_DIE("Cannot change policy after sandbox has started");
538 PolicySanityChecks(policy
);
539 policy_
.reset(policy
);
542 void SandboxBPF::InstallFilter(bool must_sync_threads
) {
543 // We want to be very careful in not imposing any requirements on the
544 // policies that are set with SetSandboxPolicy(). This means, as soon as
545 // the sandbox is active, we shouldn't be relying on libraries that could
546 // be making system calls. This, for example, means we should avoid
547 // using the heap and we should avoid using STL functions.
548 // Temporarily copy the contents of the "program" vector into a
549 // stack-allocated array; and then explicitly destroy that object.
550 // This makes sure we don't ex- or implicitly call new/delete after we
551 // installed the BPF filter program in the kernel. Depending on the
552 // system memory allocator that is in effect, these operators can result
553 // in system calls to things like munmap() or brk().
554 Program
* program
= AssembleFilter(false /* force_verification */);
556 struct sock_filter bpf
[program
->size()];
557 const struct sock_fprog prog
= {static_cast<unsigned short>(program
->size()),
559 memcpy(bpf
, &(*program
)[0], sizeof(bpf
));
562 // Make an attempt to release memory that is no longer needed here, rather
563 // than in the destructor. Try to avoid as much as possible to presume of
564 // what will be possible to do in the new (sandboxed) execution environment.
569 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0)) {
570 SANDBOX_DIE(quiet_
? NULL
: "Kernel refuses to enable no-new-privs");
573 // Install BPF filter program. If the thread state indicates multi-threading
574 // support, then the kernel has the seccomp system call. Otherwise, fall
575 // back on prctl, which requires the process to be single-threaded.
576 if (must_sync_threads
) {
577 int rv
= syscall(__NR_seccomp
, SECCOMP_SET_MODE_FILTER
,
578 SECCOMP_FILTER_FLAG_TSYNC
, reinterpret_cast<const char*>(&prog
));
580 SANDBOX_DIE(quiet_
? NULL
:
581 "Kernel refuses to turn on and synchronize threads for BPF filters");
584 if (prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, &prog
)) {
585 SANDBOX_DIE(quiet_
? NULL
: "Kernel refuses to turn on BPF filters");
589 sandbox_has_started_
= true;
592 SandboxBPF::Program
* SandboxBPF::AssembleFilter(bool force_verification
) {
594 force_verification
= true;
597 // Verify that the user pushed a policy.
600 // Assemble the BPF filter program.
601 CodeGen
* gen
= new CodeGen();
603 SANDBOX_DIE("Out of memory");
606 // If the architecture doesn't match SECCOMP_ARCH, disallow the
609 Instruction
* head
= gen
->MakeInstruction(
610 BPF_LD
+ BPF_W
+ BPF_ABS
,
612 tail
= gen
->MakeInstruction(
613 BPF_JMP
+ BPF_JEQ
+ BPF_K
,
616 gen
->MakeInstruction(
618 Kill("Invalid audit architecture in BPF filter"))));
620 bool has_unsafe_traps
= false;
622 // Evaluate all possible system calls and group their ErrorCodes into
623 // ranges of identical codes.
627 // Compile the system call ranges to an optimized BPF jumptable
628 Instruction
* jumptable
=
629 AssembleJumpTable(gen
, ranges
.begin(), ranges
.end());
631 // If there is at least one UnsafeTrap() in our program, the entire sandbox
632 // is unsafe. We need to modify the program so that all non-
633 // SECCOMP_RET_ALLOW ErrorCodes are handled in user-space. This will then
634 // allow us to temporarily disable sandboxing rules inside of callbacks to
636 gen
->Traverse(jumptable
, CheckForUnsafeErrorCodes
, &has_unsafe_traps
);
638 // Grab the system call number, so that we can implement jump tables.
639 Instruction
* load_nr
=
640 gen
->MakeInstruction(BPF_LD
+ BPF_W
+ BPF_ABS
, SECCOMP_NR_IDX
);
642 // If our BPF program has unsafe jumps, enable support for them. This
643 // test happens very early in the BPF filter program. Even before we
644 // consider looking at system call numbers.
645 // As support for unsafe jumps essentially defeats all the security
646 // measures that the sandbox provides, we print a big warning message --
647 // and of course, we make sure to only ever enable this feature if it
648 // is actually requested by the sandbox policy.
649 if (has_unsafe_traps
) {
650 if (Syscall::Call(-1) == -1 && errno
== ENOSYS
) {
652 "Support for UnsafeTrap() has not yet been ported to this "
656 if (!policy_
->EvaluateSyscall(this, __NR_rt_sigprocmask
)
657 .Equals(ErrorCode(ErrorCode::ERR_ALLOWED
)) ||
658 !policy_
->EvaluateSyscall(this, __NR_rt_sigreturn
)
659 .Equals(ErrorCode(ErrorCode::ERR_ALLOWED
))
660 #if defined(__NR_sigprocmask)
662 !policy_
->EvaluateSyscall(this, __NR_sigprocmask
)
663 .Equals(ErrorCode(ErrorCode::ERR_ALLOWED
))
665 #if defined(__NR_sigreturn)
667 !policy_
->EvaluateSyscall(this, __NR_sigreturn
)
668 .Equals(ErrorCode(ErrorCode::ERR_ALLOWED
))
672 "Invalid seccomp policy; if using UnsafeTrap(), you must "
673 "unconditionally allow sigreturn() and sigprocmask()");
676 if (!Trap::EnableUnsafeTrapsInSigSysHandler()) {
677 // We should never be able to get here, as UnsafeTrap() should never
678 // actually return a valid ErrorCode object unless the user set the
679 // CHROME_SANDBOX_DEBUGGING environment variable; and therefore,
680 // "has_unsafe_traps" would always be false. But better double-check
681 // than enabling dangerous code.
682 SANDBOX_DIE("We'd rather die than enable unsafe traps");
684 gen
->Traverse(jumptable
, RedirectToUserspace
, this);
686 // Allow system calls, if they originate from our magic return address
687 // (which we can query by calling Syscall::Call(-1)).
688 uintptr_t syscall_entry_point
= static_cast<uintptr_t>(Syscall::Call(-1));
689 uint32_t low
= static_cast<uint32_t>(syscall_entry_point
);
690 #if __SIZEOF_POINTER__ > 4
691 uint32_t hi
= static_cast<uint32_t>(syscall_entry_point
>> 32);
694 // BPF cannot do native 64bit comparisons. On 64bit architectures, we
695 // have to compare both 32bit halves of the instruction pointer. If they
696 // match what we expect, we return ERR_ALLOWED. If either or both don't
697 // match, we continue evaluating the rest of the sandbox policy.
698 Instruction
* escape_hatch
= gen
->MakeInstruction(
699 BPF_LD
+ BPF_W
+ BPF_ABS
,
701 gen
->MakeInstruction(
702 BPF_JMP
+ BPF_JEQ
+ BPF_K
,
704 #if __SIZEOF_POINTER__ > 4
705 gen
->MakeInstruction(
706 BPF_LD
+ BPF_W
+ BPF_ABS
,
708 gen
->MakeInstruction(
709 BPF_JMP
+ BPF_JEQ
+ BPF_K
,
712 gen
->MakeInstruction(BPF_RET
+ BPF_K
,
713 ErrorCode(ErrorCode::ERR_ALLOWED
)),
714 #if __SIZEOF_POINTER__ > 4
718 gen
->JoinInstructions(tail
, escape_hatch
);
720 gen
->JoinInstructions(tail
, load_nr
);
724 // On Intel architectures, verify that system call numbers are in the
725 // expected number range. The older i386 and x86-64 APIs clear bit 30
726 // on all system calls. The newer x32 API always sets bit 30.
727 #if defined(__i386__) || defined(__x86_64__)
728 Instruction
* invalidX32
= gen
->MakeInstruction(
729 BPF_RET
+ BPF_K
, Kill("Illegal mixing of system call ABIs").err_
);
730 Instruction
* checkX32
=
731 #if defined(__x86_64__) && defined(__ILP32__)
732 gen
->MakeInstruction(
733 BPF_JMP
+ BPF_JSET
+ BPF_K
, 0x40000000, 0, invalidX32
);
735 gen
->MakeInstruction(
736 BPF_JMP
+ BPF_JSET
+ BPF_K
, 0x40000000, invalidX32
, 0);
738 gen
->JoinInstructions(tail
, checkX32
);
742 // Append jump table to our pre-amble
743 gen
->JoinInstructions(tail
, jumptable
);
746 // Turn the DAG into a vector of instructions.
747 Program
* program
= new Program();
748 gen
->Compile(head
, program
);
751 // Make sure compilation resulted in BPF program that executes
752 // correctly. Otherwise, there is an internal error in our BPF compiler.
753 // There is really nothing the caller can do until the bug is fixed.
754 if (force_verification
) {
755 // Verification is expensive. We only perform this step, if we are
756 // compiled in debug mode, or if the caller explicitly requested
758 VerifyProgram(*program
, has_unsafe_traps
);
764 void SandboxBPF::VerifyProgram(const Program
& program
, bool has_unsafe_traps
) {
765 // If we previously rewrote the BPF program so that it calls user-space
766 // whenever we return an "errno" value from the filter, then we have to
767 // wrap our system call evaluator to perform the same operation. Otherwise,
768 // the verifier would also report a mismatch in return codes.
769 scoped_ptr
<const RedirectToUserSpacePolicyWrapper
> redirected_policy(
770 new RedirectToUserSpacePolicyWrapper(policy_
.get()));
772 const char* err
= NULL
;
773 if (!Verifier::VerifyBPF(this,
775 has_unsafe_traps
? *redirected_policy
: *policy_
,
777 CodeGen::PrintProgram(program
);
782 void SandboxBPF::FindRanges(Ranges
* ranges
) {
783 // Please note that "struct seccomp_data" defines system calls as a signed
784 // int32_t, but BPF instructions always operate on unsigned quantities. We
785 // deal with this disparity by enumerating from MIN_SYSCALL to MAX_SYSCALL,
786 // and then verifying that the rest of the number range (both positive and
787 // negative) all return the same ErrorCode.
788 const ErrorCode invalid_err
= policy_
->InvalidSyscall(this);
789 uint32_t old_sysnum
= 0;
790 ErrorCode old_err
= IsValidSyscallNumber(old_sysnum
)
791 ? policy_
->EvaluateSyscall(this, old_sysnum
)
794 for (SyscallIterator
iter(false); !iter
.Done();) {
795 uint32_t sysnum
= iter
.Next();
797 IsValidSyscallNumber(sysnum
)
798 ? policy_
->EvaluateSyscall(this, static_cast<int>(sysnum
))
800 if (!err
.Equals(old_err
) || iter
.Done()) {
801 ranges
->push_back(Range(old_sysnum
, sysnum
- 1, old_err
));
808 Instruction
* SandboxBPF::AssembleJumpTable(CodeGen
* gen
,
809 Ranges::const_iterator start
,
810 Ranges::const_iterator stop
) {
811 // We convert the list of system call ranges into jump table that performs
812 // a binary search over the ranges.
813 // As a sanity check, we need to have at least one distinct ranges for us
814 // to be able to build a jump table.
815 if (stop
- start
<= 0) {
816 SANDBOX_DIE("Invalid set of system call ranges");
817 } else if (stop
- start
== 1) {
818 // If we have narrowed things down to a single range object, we can
819 // return from the BPF filter program.
820 return RetExpression(gen
, start
->err
);
823 // Pick the range object that is located at the mid point of our list.
824 // We compare our system call number against the lowest valid system call
825 // number in this range object. If our number is lower, it is outside of
826 // this range object. If it is greater or equal, it might be inside.
827 Ranges::const_iterator mid
= start
+ (stop
- start
) / 2;
829 // Sub-divide the list of ranges and continue recursively.
830 Instruction
* jf
= AssembleJumpTable(gen
, start
, mid
);
831 Instruction
* jt
= AssembleJumpTable(gen
, mid
, stop
);
832 return gen
->MakeInstruction(BPF_JMP
+ BPF_JGE
+ BPF_K
, mid
->from
, jt
, jf
);
835 Instruction
* SandboxBPF::RetExpression(CodeGen
* gen
, const ErrorCode
& err
) {
836 if (err
.error_type_
== ErrorCode::ET_COND
) {
837 return CondExpression(gen
, err
);
839 return gen
->MakeInstruction(BPF_RET
+ BPF_K
, err
);
843 Instruction
* SandboxBPF::CondExpression(CodeGen
* gen
, const ErrorCode
& cond
) {
844 // We can only inspect the six system call arguments that are passed in
846 if (cond
.argno_
< 0 || cond
.argno_
>= 6) {
848 "Internal compiler error; invalid argument number "
852 // BPF programs operate on 32bit entities. Load both halves of the 64bit
853 // system call argument and then generate suitable conditional statements.
854 Instruction
* msb_head
= gen
->MakeInstruction(
855 BPF_LD
+ BPF_W
+ BPF_ABS
, SECCOMP_ARG_MSB_IDX(cond
.argno_
));
856 Instruction
* msb_tail
= msb_head
;
857 Instruction
* lsb_head
= gen
->MakeInstruction(
858 BPF_LD
+ BPF_W
+ BPF_ABS
, SECCOMP_ARG_LSB_IDX(cond
.argno_
));
859 Instruction
* lsb_tail
= lsb_head
;
861 // Emit a suitable comparison statement.
863 case ErrorCode::OP_EQUAL
:
864 // Compare the least significant bits for equality
865 lsb_tail
= gen
->MakeInstruction(BPF_JMP
+ BPF_JEQ
+ BPF_K
,
866 static_cast<uint32_t>(cond
.value_
),
867 RetExpression(gen
, *cond
.passed_
),
868 RetExpression(gen
, *cond
.failed_
));
869 gen
->JoinInstructions(lsb_head
, lsb_tail
);
871 // If we are looking at a 64bit argument, we need to also compare the
872 // most significant bits.
873 if (cond
.width_
== ErrorCode::TP_64BIT
) {
875 gen
->MakeInstruction(BPF_JMP
+ BPF_JEQ
+ BPF_K
,
876 static_cast<uint32_t>(cond
.value_
>> 32),
878 RetExpression(gen
, *cond
.failed_
));
879 gen
->JoinInstructions(msb_head
, msb_tail
);
882 case ErrorCode::OP_HAS_ALL_BITS
:
883 // Check the bits in the LSB half of the system call argument. Our
884 // OP_HAS_ALL_BITS operator passes, iff all of the bits are set. This is
885 // different from the kernel's BPF_JSET operation which passes, if any of
887 // Of course, if there is only a single set bit (or none at all), then
888 // things get easier.
890 uint32_t lsb_bits
= static_cast<uint32_t>(cond
.value_
);
891 int lsb_bit_count
= popcount(lsb_bits
);
892 if (lsb_bit_count
== 0) {
893 // No bits are set in the LSB half. The test will always pass.
894 lsb_head
= RetExpression(gen
, *cond
.passed_
);
896 } else if (lsb_bit_count
== 1) {
897 // Exactly one bit is set in the LSB half. We can use the BPF_JSET
899 lsb_tail
= gen
->MakeInstruction(BPF_JMP
+ BPF_JSET
+ BPF_K
,
901 RetExpression(gen
, *cond
.passed_
),
902 RetExpression(gen
, *cond
.failed_
));
903 gen
->JoinInstructions(lsb_head
, lsb_tail
);
905 // More than one bit is set in the LSB half. We need to combine
906 // BPF_AND and BPF_JEQ to test whether all of these bits are in fact
907 // set in the system call argument.
908 gen
->JoinInstructions(
910 gen
->MakeInstruction(BPF_ALU
+ BPF_AND
+ BPF_K
,
912 lsb_tail
= gen
->MakeInstruction(
913 BPF_JMP
+ BPF_JEQ
+ BPF_K
,
915 RetExpression(gen
, *cond
.passed_
),
916 RetExpression(gen
, *cond
.failed_
))));
920 // If we are looking at a 64bit argument, we need to also check the bits
921 // in the MSB half of the system call argument.
922 if (cond
.width_
== ErrorCode::TP_64BIT
) {
923 uint32_t msb_bits
= static_cast<uint32_t>(cond
.value_
>> 32);
924 int msb_bit_count
= popcount(msb_bits
);
925 if (msb_bit_count
== 0) {
926 // No bits are set in the MSB half. The test will always pass.
928 } else if (msb_bit_count
== 1) {
929 // Exactly one bit is set in the MSB half. We can use the BPF_JSET
931 msb_tail
= gen
->MakeInstruction(BPF_JMP
+ BPF_JSET
+ BPF_K
,
934 RetExpression(gen
, *cond
.failed_
));
935 gen
->JoinInstructions(msb_head
, msb_tail
);
937 // More than one bit is set in the MSB half. We need to combine
938 // BPF_AND and BPF_JEQ to test whether all of these bits are in fact
939 // set in the system call argument.
940 gen
->JoinInstructions(
942 gen
->MakeInstruction(
943 BPF_ALU
+ BPF_AND
+ BPF_K
,
945 gen
->MakeInstruction(BPF_JMP
+ BPF_JEQ
+ BPF_K
,
948 RetExpression(gen
, *cond
.failed_
))));
952 case ErrorCode::OP_HAS_ANY_BITS
:
953 // Check the bits in the LSB half of the system call argument. Our
954 // OP_HAS_ANY_BITS operator passes, iff any of the bits are set. This maps
955 // nicely to the kernel's BPF_JSET operation.
957 uint32_t lsb_bits
= static_cast<uint32_t>(cond
.value_
);
959 // No bits are set in the LSB half. The test will always fail.
960 lsb_head
= RetExpression(gen
, *cond
.failed_
);
963 lsb_tail
= gen
->MakeInstruction(BPF_JMP
+ BPF_JSET
+ BPF_K
,
965 RetExpression(gen
, *cond
.passed_
),
966 RetExpression(gen
, *cond
.failed_
));
967 gen
->JoinInstructions(lsb_head
, lsb_tail
);
971 // If we are looking at a 64bit argument, we need to also check the bits
972 // in the MSB half of the system call argument.
973 if (cond
.width_
== ErrorCode::TP_64BIT
) {
974 uint32_t msb_bits
= static_cast<uint32_t>(cond
.value_
>> 32);
976 // No bits are set in the MSB half. The test will always fail.
979 msb_tail
= gen
->MakeInstruction(BPF_JMP
+ BPF_JSET
+ BPF_K
,
981 RetExpression(gen
, *cond
.passed_
),
983 gen
->JoinInstructions(msb_head
, msb_tail
);
988 // TODO(markus): Need to add support for OP_GREATER
989 SANDBOX_DIE("Not implemented");
993 // Ensure that we never pass a 64bit value, when we only expect a 32bit
994 // value. This is somewhat complicated by the fact that on 64bit systems,
995 // callers could legitimately pass in a non-zero value in the MSB, iff the
996 // LSB has been sign-extended into the MSB.
997 if (cond
.width_
== ErrorCode::TP_32BIT
) {
998 if (cond
.value_
>> 32) {
1000 "Invalid comparison of a 32bit system call argument "
1001 "against a 64bit constant; this test is always false.");
1004 Instruction
* invalid_64bit
= RetExpression(gen
, Unexpected64bitArgument());
1005 #if __SIZEOF_POINTER__ > 4
1006 invalid_64bit
= gen
->MakeInstruction(
1007 BPF_JMP
+ BPF_JEQ
+ BPF_K
,
1009 gen
->MakeInstruction(BPF_LD
+ BPF_W
+ BPF_ABS
,
1010 SECCOMP_ARG_LSB_IDX(cond
.argno_
),
1011 gen
->MakeInstruction(BPF_JMP
+ BPF_JGE
+ BPF_K
,
1017 gen
->JoinInstructions(
1019 gen
->MakeInstruction(
1020 BPF_JMP
+ BPF_JEQ
+ BPF_K
, 0, lsb_head
, invalid_64bit
));
1026 ErrorCode
SandboxBPF::Unexpected64bitArgument() {
1027 return Kill("Unexpected 64bit argument detected");
1030 ErrorCode
SandboxBPF::Trap(Trap::TrapFnc fnc
, const void* aux
) {
1031 return Trap::MakeTrap(fnc
, aux
, true /* Safe Trap */);
1034 ErrorCode
SandboxBPF::UnsafeTrap(Trap::TrapFnc fnc
, const void* aux
) {
1035 return Trap::MakeTrap(fnc
, aux
, false /* Unsafe Trap */);
1038 intptr_t SandboxBPF::ForwardSyscall(const struct arch_seccomp_data
& args
) {
1039 return Syscall::Call(args
.nr
,
1040 static_cast<intptr_t>(args
.args
[0]),
1041 static_cast<intptr_t>(args
.args
[1]),
1042 static_cast<intptr_t>(args
.args
[2]),
1043 static_cast<intptr_t>(args
.args
[3]),
1044 static_cast<intptr_t>(args
.args
[4]),
1045 static_cast<intptr_t>(args
.args
[5]));
1048 ErrorCode
SandboxBPF::Cond(int argno
,
1049 ErrorCode::ArgType width
,
1050 ErrorCode::Operation op
,
1052 const ErrorCode
& passed
,
1053 const ErrorCode
& failed
) {
1054 return ErrorCode(argno
,
1058 &*conds_
->insert(passed
).first
,
1059 &*conds_
->insert(failed
).first
);
1062 ErrorCode
SandboxBPF::Kill(const char* msg
) {
1063 return Trap(BPFFailure
, const_cast<char*>(msg
));
1066 SandboxBPF::SandboxStatus
SandboxBPF::status_
= STATUS_UNKNOWN
;
1068 } // namespace sandbox