1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef SANDBOX_BPF_H__
6 #define SANDBOX_BPF_H__
11 #include <linux/audit.h>
12 #include <linux/filter.h>
13 // #include <linux/seccomp.h>
14 #include <linux/unistd.h>
15 #include <netinet/in.h>
16 #include <netinet/tcp.h>
17 #include <netinet/udp.h>
25 #include <sys/ioctl.h>
28 #include <sys/prctl.h>
31 #include <sys/types.h>
42 #ifndef SECCOMP_BPF_STANDALONE
43 #include "base/basictypes.h"
44 #include "base/eintr_wrapper.h"
45 #include "base/logging.h"
48 // The Seccomp2 kernel ABI is not part of older versions of glibc.
49 // As we can't break compilation with these versions of the library,
50 // we explicitly define all missing symbols.
52 #ifndef PR_SET_NO_NEW_PRIVS
53 #define PR_SET_NO_NEW_PRIVS 38
54 #define PR_GET_NO_NEW_PRIVS 39
// Fallback definitions of the seccomp mode and filter return-value constants.
// They are only provided when SECCOMP_MODE_FILTER is not already defined.
59 #ifndef SECCOMP_MODE_FILTER
60 #define SECCOMP_MODE_DISABLED 0
61 #define SECCOMP_MODE_STRICT 1
62 #define SECCOMP_MODE_FILTER 2 // Uses a user-supplied filter
63 #define SECCOMP_RET_KILL 0x00000000U // Kill the task immediately
64 #define SECCOMP_RET_TRAP 0x00030000U // Disallow and force a SIGSYS
65 #define SECCOMP_RET_ERRNO 0x00050000U // Returns an errno
66 #define SECCOMP_RET_TRACE 0x7ff00000U // Pass to a tracer or disallow
67 #define SECCOMP_RET_ALLOW 0x7fff0000U // Allow
68 #define SECCOMP_RET_INVALID 0x8f8f8f8fU // Illegal return value
69 #define SECCOMP_RET_ACTION 0xffff0000U // Mask for the action (upper 16 bits)
70 #define SECCOMP_RET_DATA 0x0000ffffU // Mask for the data (lower 16 bits)
// SECCOMP_DENY_ERRNO is an alias for EPERM.
// NOTE(review): presumably the errno used when a system call is denied -
// confirm against the users of this macro.
72 #define SECCOMP_DENY_ERRNO EPERM
77 // Impose some reasonable maximum BPF program size. Realistically, the
78 // kernel probably has much lower limits. But by limiting to less than
79 // 30 bits, we can ease requirements on some of our data types.
80 #define SECCOMP_MAX_PROGRAM_SIZE (1<<30)
// Configuration for the branch whose SECCOMP_ARCH is AUDIT_ARCH_I386.
// MIN_SYSCALL / MAX_SYSCALL are the unsigned bounds 0 and 1024 used for
// system call numbers on this architecture.
83 #define MIN_SYSCALL 0u
84 #define MAX_SYSCALL 1024u
85 #define SECCOMP_ARCH AUDIT_ARCH_I386
// SECCOMP_REG yields element |_reg| of uc_mcontext.gregs[] reached through
// the pointer |_ctx| (presumably a ucontext_t* - confirm at the call sites).
87 #define SECCOMP_REG(_ctx, _reg) ((_ctx)->uc_mcontext.gregs[(_reg)])
// The result and the system call number both map to the same register (EAX).
88 #define SECCOMP_RESULT(_ctx) SECCOMP_REG(_ctx, REG_EAX)
89 #define SECCOMP_SYSCALL(_ctx) SECCOMP_REG(_ctx, REG_EAX)
// Instruction pointer.
90 #define SECCOMP_IP(_ctx) SECCOMP_REG(_ctx, REG_EIP)
// System call parameters 1..6 map to EBX, ECX, EDX, ESI, EDI and EBP.
91 #define SECCOMP_PARM1(_ctx) SECCOMP_REG(_ctx, REG_EBX)
92 #define SECCOMP_PARM2(_ctx) SECCOMP_REG(_ctx, REG_ECX)
93 #define SECCOMP_PARM3(_ctx) SECCOMP_REG(_ctx, REG_EDX)
94 #define SECCOMP_PARM4(_ctx) SECCOMP_REG(_ctx, REG_ESI)
95 #define SECCOMP_PARM5(_ctx) SECCOMP_REG(_ctx, REG_EDI)
96 #define SECCOMP_PARM6(_ctx) SECCOMP_REG(_ctx, REG_EBP)
98 #elif defined(__x86_64__)
// Configuration used when __x86_64__ is defined; SECCOMP_ARCH is
// AUDIT_ARCH_X86_64. MIN_SYSCALL / MAX_SYSCALL are the unsigned bounds 0 and
// 1024 used for system call numbers on this architecture.
99 #define MIN_SYSCALL 0u
100 #define MAX_SYSCALL 1024u
101 #define SECCOMP_ARCH AUDIT_ARCH_X86_64
// SECCOMP_REG yields element |_reg| of uc_mcontext.gregs[] reached through
// the pointer |_ctx| (presumably a ucontext_t* - confirm at the call sites).
103 #define SECCOMP_REG(_ctx, _reg) ((_ctx)->uc_mcontext.gregs[(_reg)])
// The result and the system call number both map to the same register (RAX).
104 #define SECCOMP_RESULT(_ctx) SECCOMP_REG(_ctx, REG_RAX)
105 #define SECCOMP_SYSCALL(_ctx) SECCOMP_REG(_ctx, REG_RAX)
// Instruction pointer.
106 #define SECCOMP_IP(_ctx) SECCOMP_REG(_ctx, REG_RIP)
// System call parameters 1..6 map to RDI, RSI, RDX, R10, R8 and R9.
107 #define SECCOMP_PARM1(_ctx) SECCOMP_REG(_ctx, REG_RDI)
108 #define SECCOMP_PARM2(_ctx) SECCOMP_REG(_ctx, REG_RSI)
109 #define SECCOMP_PARM3(_ctx) SECCOMP_REG(_ctx, REG_RDX)
110 #define SECCOMP_PARM4(_ctx) SECCOMP_REG(_ctx, REG_R10)
111 #define SECCOMP_PARM5(_ctx) SECCOMP_REG(_ctx, REG_R8)
112 #define SECCOMP_PARM6(_ctx) SECCOMP_REG(_ctx, REG_R9)
114 #elif defined(__arm__) && (defined(__thumb__) || defined(__ARM_EABI__))
115 // ARM EABI includes "ARM private" system calls starting at |__ARM_NR_BASE|,
116 // and a "ghost syscall private to the kernel", cmpxchg,
117 // at |__ARM_NR_BASE+0x00fff0|.
118 // See </arch/arm/include/asm/unistd.h> in the Linux kernel.
119 #define MIN_SYSCALL ((unsigned int)__NR_SYSCALL_BASE)
120 #define MAX_SYSCALL ((unsigned int)__ARM_NR_BASE + 0x00ffffu)
121 // <linux/audit.h> includes <linux/elf-em.h>, which does not define EM_ARM.
122 // <linux/elf.h> only includes <asm/elf.h> if we're in the kernel.
123 # if !defined(EM_ARM)
126 #define SECCOMP_ARCH AUDIT_ARCH_ARM
128 // ARM sigcontext_t is different from i386/x86_64.
129 // See </arch/arm/include/asm/sigcontext.h> in the Linux kernel.
// Unlike the gregs[] indexing used for x86, the register name is token-pasted
// onto "arm_", so SECCOMP_REG(ctx, r0) expands to (ctx)->uc_mcontext.arm_r0.
130 #define SECCOMP_REG(_ctx, _reg) ((_ctx)->uc_mcontext.arm_##_reg)
131 // ARM EABI syscall convention.
// The result is read from r0, which is also the first parameter register;
// the system call number is read from r7 and the instruction pointer from pc.
132 #define SECCOMP_RESULT(_ctx) SECCOMP_REG(_ctx, r0)
133 #define SECCOMP_SYSCALL(_ctx) SECCOMP_REG(_ctx, r7)
134 #define SECCOMP_IP(_ctx) SECCOMP_REG(_ctx, pc)
// System call parameters 1..6 map to r0 through r5.
135 #define SECCOMP_PARM1(_ctx) SECCOMP_REG(_ctx, r0)
136 #define SECCOMP_PARM2(_ctx) SECCOMP_REG(_ctx, r1)
137 #define SECCOMP_PARM3(_ctx) SECCOMP_REG(_ctx, r2)
138 #define SECCOMP_PARM4(_ctx) SECCOMP_REG(_ctx, r3)
139 #define SECCOMP_PARM5(_ctx) SECCOMP_REG(_ctx, r4)
140 #define SECCOMP_PARM6(_ctx) SECCOMP_REG(_ctx, r5)
143 #error Unsupported target platform
147 struct arch_seccomp_data
{
150 uint64_t instruction_pointer
;
160 #ifdef SECCOMP_BPF_STANDALONE
// arraysize(x): number of elements in the array |x|, computed as the size of
// the whole array divided by the size of its first element. |x| must be a
// real array, not a pointer. The expansion is fully parenthesized so that the
// macro can be embedded in larger expressions without precedence surprises.
#define arraysize(x) (sizeof(x) / sizeof(*(x)))
162 #define HANDLE_EINTR TEMP_FAILURE_RETRY
163 #define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
165 TypeName(const TypeName&); \
166 void operator=(const TypeName&)
170 namespace playground2
{
175 STATUS_UNKNOWN
, // Status prior to calling supportsSeccompSandbox()
176 STATUS_UNSUPPORTED
, // The kernel does not appear to support sandboxing
177 STATUS_UNAVAILABLE
, // Currently unavailable but might work again later
178 STATUS_AVAILABLE
, // Sandboxing is available but not currently active
179 STATUS_ENABLED
// The sandbox is now active
185 SB_INSPECT_ARG_1
= 0x8001,
186 SB_INSPECT_ARG_2
= 0x8002,
187 SB_INSPECT_ARG_3
= 0x8004,
188 SB_INSPECT_ARG_4
= 0x8008,
189 SB_INSPECT_ARG_5
= 0x8010,
190 SB_INSPECT_ARG_6
= 0x8020
193 // TrapFnc is a pointer to a function that handles Seccomp traps in
194 // user-space. The seccomp policy can request that a trap handler gets
195 // installed; it does so by returning a suitable ErrorCode() from the
196 // syscallEvaluator. See the ErrorCode() constructor for how to pass in
197 // the function pointer.
198 // Please note that TrapFnc is executed from signal context and must be
199 // async-signal safe:
200 // http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html
// Signature: the handler receives |args| as a const reference to a
// struct arch_seccomp_data and |aux| as an untyped pointer, and returns an
// intptr_t.
// NOTE(review): |aux| is presumably the auxiliary pointer that was passed to
// the ErrorCode(TrapFnc, const void *, int) constructor - confirm.
201 typedef intptr_t (*TrapFnc
)(const struct arch_seccomp_data
& args
, void *aux
);
204 friend class Sandbox
;
206 // We can either wrap a symbolic ErrorCode (i.e. enum values), an errno
207 // value (in the range 1..4095), or a pointer to a TrapFnc callback
208 // handling a SECCOMP_RET_TRAP trap.
209 // All of these different values are stored in the "err_" field. So, code
210 // that is using the ErrorCode class typically operates on a single 32bit
212 // This is not only quite efficient, it also makes the API really easy to
214 ErrorCode(int err
= SB_INVALID
)
220 err_
= SECCOMP_RET_INVALID
;
223 err_
= SECCOMP_RET_ALLOW
;
225 case SB_INSPECT_ARG_1
...SB_INSPECT_ARG_6
:
226 die("Not implemented");
229 err_
= SECCOMP_RET_ERRNO
+ err
;
232 die("Invalid use of ErrorCode object");
236 // If we are wrapping a callback, we must assign a unique id. This id is
237 // how the kernel tells us which one of our different SECCOMP_RET_TRAP
238 // cases has been triggered.
239 // The getTrapId() function assigns one unique id (starting at 1) for
240 // each distinct pair of TrapFnc and auxiliary data.
241 ErrorCode(TrapFnc fnc
, const void *aux
, int id
= 0) :
242 id_(id
? id
: getTrapId(fnc
, aux
)),
244 aux_(const_cast<void *>(aux
)),
245 err_(SECCOMP_RET_TRAP
+ id_
) {
248 // Destructor doesn't need to do anything.
251 // Always return the value that goes into the BPF filter program.
// Implicit conversion operator: returns the err_ member unchanged and does
// not modify the object (const).
252 operator uint32_t() const { return err_
; }
255 // Fields needed for SECCOMP_RET_TRAP callbacks
260 // 32bit field used for all possible types of ErrorCode values
265 OP_NOP
, OP_EQUAL
, OP_NOTEQUAL
, OP_LESS
,
266 OP_LESS_EQUAL
, OP_GREATER
, OP_GREATER_EQUAL
,
267 OP_HAS_BITS
, OP_DOES_NOT_HAVE_BITS
// EvaluateSyscall: pointer to a function that takes a system call number
// |sysno| and returns an ErrorCode.
278 typedef ErrorCode (*EvaluateSyscall
)(int sysno
);
// EvaluateArguments: pointer to a function that takes a system call number
// |sysno|, an int |arg| and a pointer to a Constraint, and returns an int.
// NOTE(review): |arg| presumably selects which system call argument is being
// inspected - confirm against the implementation.
279 typedef int (*EvaluateArguments
)(int sysno
, int arg
,
280 Constraint
*constraint
);
// Evaluators: a vector of (EvaluateSyscall, EvaluateArguments) pairs.
281 typedef std::vector
<std::pair
<EvaluateSyscall
,EvaluateArguments
> >Evaluators
;
283 // There are a lot of reasons why the Seccomp sandbox might not be available.
284 // This could be because the kernel does not support Seccomp mode, or it
285 // could be because another sandbox is already active.
286 // "proc_fd" should be a file descriptor for "/proc", or -1 if not
287 // provided by the caller.
288 static SandboxStatus
supportsSeccompSandbox(int proc_fd
);
290 // The sandbox needs to be able to access files in "/proc/self". If this
291 // directory is not accessible when "startSandbox()" gets called, the caller
292 // can provide an already opened file descriptor by calling "setProcFd()".
293 // The sandbox becomes the new owner of this file descriptor and will
294 // eventually close it when "startSandbox()" executes.
295 static void setProcFd(int proc_fd
);
297 // The system call evaluator function is called with the system
298 // call number. It can decide to allow the system call unconditionally
299 // by returning "0"; it can deny the system call unconditionally by
300 // returning an appropriate "errno" value; or it can request inspection
301 // of system call argument(s) by returning a suitable combination of
302 // SB_INSPECT_ARG_x bits.
303 // The system argument evaluator is called (if needed) to query additional
304 // constraints for the system call arguments. In the vast majority of
305 // cases, it will set a "Constraint" that forces a new "errno" value.
306 // But for more complex filters, it is possible to return another mask
307 // of SB_INSPECT_ARG_x bits.
308 static void setSandboxPolicy(EvaluateSyscall syscallEvaluator
,
309 EvaluateArguments argumentEvaluator
);
311 // This is the main public entry point. It finds all system calls that
312 // need rewriting, sets up the resources needed by the sandbox, and
313 // enters Seccomp mode.
314 static void startSandbox();
317 // Print an error message and terminate the program. Used for fatal errors.
318 static void die(const char *msg
) __attribute__((noreturn
)) {
320 #ifndef SECCOMP_BPF_STANDALONE
322 // LOG(FATAL) is not necessarily async-signal safe. It would be
323 // better to always use the code for the SECCOMP_BPF_STANDALONE case.
324 // But that prevents the logging and reporting infrastructure from
325 // picking up sandbox related crashes.
326 // For now, in picking between two evils, we decided in favor of
327 // LOG(FATAL). In the long run, we probably want to rewrite this code
328 // to be async-signal safe.
333 // If there is no logging infrastructure in place, we just write error
334 // messages to stderr.
335 // We also write to stderr, if we are called in a child process from
336 // supportsSeccompSandbox(). This makes sure we can actually do the
337 // correct logging from the parent process, which is more likely to
338 // have access to logging infrastructure.
339 if (HANDLE_EINTR(write(2, msg
, strlen(msg
)))) { }
340 if (HANDLE_EINTR(write(2, "\n", 1))) { }
344 // exit_group() should exit our program. After all, it is defined as a
345 // function that doesn't return. But things can theoretically go wrong.
346 // Especially, since we are dealing with system call filters. Continuing
347 // execution would be very bad in most cases where die() gets called.
348 // So, if there is no way for us to ask for the program to exit, the next
349 // best thing we can do is to loop indefinitely. Maybe, somebody will
350 // notice and file a bug...
351 syscall(__NR_exit_group
, 1);
356 // Get a file descriptor pointing to "/proc", if currently available.
// Returns the static proc_fd_ member as-is; no validity check is done here.
// NOTE(review): the value when no descriptor is available is presumably -1,
// matching the "-1 if not provided" convention of supportsSeccompSandbox() -
// confirm against where proc_fd_ is initialized.
357 static int getProcFd() { return proc_fd_
; }
361 friend class Verifier
;
363 Range(uint32_t f
, uint32_t t
, const ErrorCode
& e
) :
372 FixUp(unsigned int a
, bool j
) :
// Container aliases.
// Ranges: a vector of Range objects.
377 typedef std::vector
<Range
> Ranges
;
// RetInsns: maps a uint32_t key to a vector of FixUp objects.
378 typedef std::map
<uint32_t, std::vector
<FixUp
> > RetInsns
;
// Program: a vector of struct sock_filter entries.
379 typedef std::vector
<struct sock_filter
> Program
;
// Traps: a vector of ErrorCode objects.
380 typedef std::vector
<ErrorCode
> Traps
;
// TrapIds: maps a (TrapFnc, const void *) pair to an int.
// NOTE(review): presumably the unique trap id handed out by getTrapId() for
// each distinct handler/auxiliary-data pair - confirm.
381 typedef std::map
<std::pair
<TrapFnc
, const void *>, int> TrapIds
;
// Declarations of static helpers; their definitions are not visible here.
// NOTE(review): probeEvaluator() and allowAllEvaluator() name their parameter
// "signo", whereas the EvaluateSyscall typedef names it "sysno". If these
// functions are used as syscall evaluators, the parameter is presumably a
// system call number rather than a signal number - confirm and consider
// renaming.
// probeEvaluator() is additionally declared with __attribute__((const)).
383 static ErrorCode
probeEvaluator(int signo
) __attribute__((const));
384 static void probeProcess(void);
385 static ErrorCode
allowAllEvaluator(int signo
);
386 static void tryVsyscallProcess(void);
// Takes a |proc_fd| argument and returns a bool.
// NOTE(review): presumably reports whether the kernel supports seccomp BPF,
// with |proc_fd| being a descriptor for "/proc" - confirm.
387 static bool kernelSupportSeccompBPF(int proc_fd
);
388 static bool RunFunctionInPolicy(void (*function
)(),
389 EvaluateSyscall syscallEvaluator
,
391 static bool isSingleThreaded(int proc_fd
);
392 static bool disableFilesystem();
393 static void policySanityChecks(EvaluateSyscall syscallEvaluator
,
394 EvaluateArguments argumentEvaluator
);
395 static void installFilter();
396 static void findRanges(Ranges
*ranges
);
397 static void emitJumpStatements(Program
*program
, RetInsns
*rets
,
398 Ranges::const_iterator start
,
399 Ranges::const_iterator stop
);
400 static void emitReturnStatements(Program
*prog
, const RetInsns
& rets
);
401 static void sigSys(int nr
, siginfo_t
*info
, void *void_context
);
402 static intptr_t bpfFailure(const struct arch_seccomp_data
& data
, void *aux
);
403 static int getTrapId(TrapFnc fnc
, const void *aux
);
406 static SandboxStatus status_
;
408 static Evaluators evaluators_
;
409 static Traps
*traps_
;
410 static TrapIds trapIds_
;
411 static ErrorCode
*trapArray_
;
412 static size_t trapArraySize_
;
413 DISALLOW_IMPLICIT_CONSTRUCTORS(Sandbox
);
418 #endif // SANDBOX_BPF_H__