Add a constructor that takes only the "interesting" args (basically the args that...
[chromium-blink-merge.git] / sandbox / linux / seccomp-bpf / sandbox_bpf.h
blobeb99d9920009f5c447ed7a321d34b70d343a4b02
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef SANDBOX_BPF_H__
6 #define SANDBOX_BPF_H__
8 #include <endian.h>
9 #include <errno.h>
10 #include <fcntl.h>
11 #include <linux/audit.h>
12 #include <linux/filter.h>
13 // #include <linux/seccomp.h>
14 #include <linux/unistd.h>
15 #include <netinet/in.h>
16 #include <netinet/tcp.h>
17 #include <netinet/udp.h>
18 #include <sched.h>
19 #include <signal.h>
20 #include <stddef.h>
21 #include <stdint.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/ioctl.h>
26 #include <sys/ipc.h>
27 #include <sys/mman.h>
28 #include <sys/prctl.h>
29 #include <sys/shm.h>
30 #include <sys/stat.h>
31 #include <sys/types.h>
32 #include <sys/uio.h>
33 #include <sys/wait.h>
34 #include <unistd.h>
36 #include <algorithm>
37 #include <limits>
38 #include <map>
39 #include <utility>
40 #include <vector>
42 #ifndef SECCOMP_BPF_STANDALONE
43 #include "base/basictypes.h"
44 #include "base/eintr_wrapper.h"
45 #include "base/logging.h"
46 #endif
48 // The Seccomp2 kernel ABI is not part of older versions of glibc.
49 // As we can't break compilation with these versions of the library,
50 // we explicitly define all missing symbols.
52 #ifndef PR_SET_NO_NEW_PRIVS
53 #define PR_SET_NO_NEW_PRIVS 38
54 #define PR_GET_NO_NEW_PRIVS 39
55 #endif
56 #ifndef IPC_64
57 #define IPC_64 0x0100
58 #endif
59 #ifndef SECCOMP_MODE_FILTER
60 #define SECCOMP_MODE_DISABLED 0
61 #define SECCOMP_MODE_STRICT 1
62 #define SECCOMP_MODE_FILTER 2 // Uses a user-supplied filter
63 #define SECCOMP_RET_KILL 0x00000000U // Kill the task immediately
64 #define SECCOMP_RET_TRAP 0x00030000U // Disallow and force a SIGSYS
65 #define SECCOMP_RET_ERRNO 0x00050000U // Returns an errno
66 #define SECCOMP_RET_TRACE 0x7ff00000U // Pass to a tracer or disallow
67 #define SECCOMP_RET_ALLOW 0x7fff0000U // Allow
68 #define SECCOMP_RET_INVALID 0x8f8f8f8fU // Illegal return value
69 #define SECCOMP_RET_ACTION 0xffff0000U // Mask selecting the action (upper 16 bits) of a return value
70 #define SECCOMP_RET_DATA 0x0000ffffU // Mask selecting the data (lower 16 bits) of a return value
71 #endif
72 #define SECCOMP_DENY_ERRNO EPERM // NOTE(review): presumably the errno reported for denied system calls; usage is not visible in this header
73 #ifndef SYS_SECCOMP
74 #define SYS_SECCOMP 1
75 #endif
77 // Impose some reasonable maximum BPF program size. Realistically, the
78 // kernel probably has much lower limits. But by limiting to less than
79 // 30 bits, we can ease requirements on some of our data types.
80 #define SECCOMP_MAX_PROGRAM_SIZE (1<<30)
82 #if defined(__i386__) // 32-bit x86: registers are read from uc_mcontext.gregs[]
83 #define MIN_SYSCALL 0u
84 #define MAX_SYSCALL 1024u
85 #define SECCOMP_ARCH AUDIT_ARCH_I386
87 #define SECCOMP_REG(_ctx, _reg) ((_ctx)->uc_mcontext.gregs[(_reg)])
88 #define SECCOMP_RESULT(_ctx) SECCOMP_REG(_ctx, REG_EAX) // same register as SECCOMP_SYSCALL
89 #define SECCOMP_SYSCALL(_ctx) SECCOMP_REG(_ctx, REG_EAX)
90 #define SECCOMP_IP(_ctx) SECCOMP_REG(_ctx, REG_EIP)
91 #define SECCOMP_PARM1(_ctx) SECCOMP_REG(_ctx, REG_EBX)
92 #define SECCOMP_PARM2(_ctx) SECCOMP_REG(_ctx, REG_ECX)
93 #define SECCOMP_PARM3(_ctx) SECCOMP_REG(_ctx, REG_EDX)
94 #define SECCOMP_PARM4(_ctx) SECCOMP_REG(_ctx, REG_ESI)
95 #define SECCOMP_PARM5(_ctx) SECCOMP_REG(_ctx, REG_EDI)
96 #define SECCOMP_PARM6(_ctx) SECCOMP_REG(_ctx, REG_EBP)
98 #elif defined(__x86_64__) // 64-bit x86: registers are read from uc_mcontext.gregs[]
99 #define MIN_SYSCALL 0u
100 #define MAX_SYSCALL 1024u
101 #define SECCOMP_ARCH AUDIT_ARCH_X86_64
103 #define SECCOMP_REG(_ctx, _reg) ((_ctx)->uc_mcontext.gregs[(_reg)])
104 #define SECCOMP_RESULT(_ctx) SECCOMP_REG(_ctx, REG_RAX) // same register as SECCOMP_SYSCALL
105 #define SECCOMP_SYSCALL(_ctx) SECCOMP_REG(_ctx, REG_RAX)
106 #define SECCOMP_IP(_ctx) SECCOMP_REG(_ctx, REG_RIP)
107 #define SECCOMP_PARM1(_ctx) SECCOMP_REG(_ctx, REG_RDI)
108 #define SECCOMP_PARM2(_ctx) SECCOMP_REG(_ctx, REG_RSI)
109 #define SECCOMP_PARM3(_ctx) SECCOMP_REG(_ctx, REG_RDX)
110 #define SECCOMP_PARM4(_ctx) SECCOMP_REG(_ctx, REG_R10)
111 #define SECCOMP_PARM5(_ctx) SECCOMP_REG(_ctx, REG_R8)
112 #define SECCOMP_PARM6(_ctx) SECCOMP_REG(_ctx, REG_R9)
114 #elif defined(__arm__) && (defined(__thumb__) || defined(__ARM_EABI__))
115 // ARM EABI includes "ARM private" system calls starting at |__ARM_NR_BASE|,
116 // and a "ghost syscall private to the kernel", cmpxchg,
117 // at |__ARM_NR_BASE+0x00fff0|.
118 // See </arch/arm/include/asm/unistd.h> in the Linux kernel.
119 #define MIN_SYSCALL ((unsigned int)__NR_SYSCALL_BASE)
120 #define MAX_SYSCALL ((unsigned int)__ARM_NR_BASE + 0x00ffffu)
121 // <linux/audit.h> includes <linux/elf-em.h>, which does not define EM_ARM.
122 // <linux/elf.h> only includes <asm/elf.h> if we're in the kernel.
123 # if !defined(EM_ARM)
124 # define EM_ARM 40
125 # endif
126 #define SECCOMP_ARCH AUDIT_ARCH_ARM
128 // ARM sigcontext_t is different from i386/x86_64.
129 // See </arch/arm/include/asm/sigcontext.h> in the Linux kernel.
130 #define SECCOMP_REG(_ctx, _reg) ((_ctx)->uc_mcontext.arm_##_reg) // token-pastes the register name onto the arm_ field prefix
131 // ARM EABI syscall convention.
132 #define SECCOMP_RESULT(_ctx) SECCOMP_REG(_ctx, r0) // same register as SECCOMP_PARM1
133 #define SECCOMP_SYSCALL(_ctx) SECCOMP_REG(_ctx, r7)
134 #define SECCOMP_IP(_ctx) SECCOMP_REG(_ctx, pc)
135 #define SECCOMP_PARM1(_ctx) SECCOMP_REG(_ctx, r0)
136 #define SECCOMP_PARM2(_ctx) SECCOMP_REG(_ctx, r1)
137 #define SECCOMP_PARM3(_ctx) SECCOMP_REG(_ctx, r2)
138 #define SECCOMP_PARM4(_ctx) SECCOMP_REG(_ctx, r3)
139 #define SECCOMP_PARM5(_ctx) SECCOMP_REG(_ctx, r4)
140 #define SECCOMP_PARM6(_ctx) SECCOMP_REG(_ctx, r5)
142 #else
143 #error Unsupported target platform
145 #endif
147 struct arch_seccomp_data { // Passed by const reference to TrapFnc handlers. NOTE(review): presumably mirrors the kernel's struct seccomp_data; confirm
148 int nr; // NOTE(review): presumably the system call number; confirm
149 uint32_t arch; // NOTE(review): presumably an AUDIT_ARCH_* value such as SECCOMP_ARCH; confirm
150 uint64_t instruction_pointer;
151 uint64_t args[6]; // six slots; cf. the six SECCOMP_PARM1..SECCOMP_PARM6 accessors
154 struct arch_sigsys { // NOTE(review): presumably the SIGSYS-specific payload delivered with a trap; its use is not visible in this header
155 void *ip;
156 int nr;
157 unsigned int arch;
#ifdef SECCOMP_BPF_STANDALONE
// Stand-ins for the helpers that the base/ headers provide in a regular
// (non-standalone) build.

// Number of elements in a true array (not a pointer). The whole expansion is
// parenthesized so that it composes safely inside larger expressions.
#define arraysize(x) (sizeof(x) / sizeof(*(x)))

// Retry an expression for as long as it fails with EINTR.
#define HANDLE_EINTR TEMP_FAILURE_RETRY

// Declares (without defining) the default constructor, copy constructor and
// assignment operator. Meant to be placed in a class's private section.
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
  TypeName();                                    \
  TypeName(const TypeName&);                     \
  void operator=(const TypeName&)
#endif
170 namespace playground2 {
172 class Sandbox {
173 public:
174 enum SandboxStatus { // Result type of supportsSeccompSandbox()
175 STATUS_UNKNOWN, // Status prior to calling supportsSeccompSandbox()
176 STATUS_UNSUPPORTED, // The kernel does not appear to support sandboxing
177 STATUS_UNAVAILABLE, // Currently unavailable but might work again later
178 STATUS_AVAILABLE, // Sandboxing is available but not currently active
179 STATUS_ENABLED // The sandbox is now active
182 enum { // Symbolic codes understood by the ErrorCode(int) constructor
183 SB_INVALID = -1, // ErrorCode's default argument; stored as SECCOMP_RET_INVALID
184 SB_ALLOWED = 0x0000, // stored as SECCOMP_RET_ALLOW
185 SB_INSPECT_ARG_1 = 0x8001, // SB_INSPECT_ARG_x: 0x8000 plus one distinct low bit per argument;
186 SB_INSPECT_ARG_2 = 0x8002, // ErrorCode(int) currently rejects the whole range with die("Not implemented")
187 SB_INSPECT_ARG_3 = 0x8004,
188 SB_INSPECT_ARG_4 = 0x8008,
189 SB_INSPECT_ARG_5 = 0x8010,
190 SB_INSPECT_ARG_6 = 0x8020
193 // TrapFnc is a pointer to a function that handles Seccomp traps in
194 // user-space. The seccomp policy can request that a trap handler gets
195 // installed; it does so by returning a suitable ErrorCode() from the
196 // syscallEvaluator. See the ErrorCode() constructor for how to pass in
197 // the function pointer.
198 // Please note that TrapFnc is executed from signal context and must be
199 // async-signal safe:
200 // http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html
201 typedef intptr_t (*TrapFnc)(const struct arch_seccomp_data& args, void *aux);
203 class ErrorCode {
204 friend class Sandbox;
205 public:
206 // We can either wrap a symbolic ErrorCode (i.e. enum values), an errno
207 // value (in the range 1..4095), or a pointer to a TrapFnc callback
208 // handling a SECCOMP_RET_TRAP trap.
209 // All of these different values are stored in the "err_" field. So, code
210 // that is using the ErrorCode class typically operates on a single 32bit
211 // field.
212 // This is not only quite efficient, it also makes the API really easy to
213 // use.
214 ErrorCode(int err = SB_INVALID)
215 : id_(0),
216 fnc_(NULL),
217 aux_(NULL) {
218 switch (err) {
219 case SB_INVALID:
220 err_ = SECCOMP_RET_INVALID;
221 break;
222 case SB_ALLOWED:
223 err_ = SECCOMP_RET_ALLOW;
224 break;
225 case SB_INSPECT_ARG_1...SB_INSPECT_ARG_6: // GNU case-range extension; covers every value from 0x8001 to 0x8020
226 die("Not implemented");
227 break;
228 case 1 ... 4095: // errno values are added to SECCOMP_RET_ERRNO
229 err_ = SECCOMP_RET_ERRNO + err;
230 break;
231 default:
232 die("Invalid use of ErrorCode object");
236 // If we are wrapping a callback, we must assign a unique id. This id is
237 // how the kernel tells us which one of our different SECCOMP_RET_TRAP
238 // cases has been triggered.
239 // The getTrapId() function assigns one unique id (starting at 1) for
240 // each distinct pair of TrapFnc and auxiliary data.
241 ErrorCode(TrapFnc fnc, const void *aux, int id = 0) :
242 id_(id ? id : getTrapId(fnc, aux)), // a non-zero id is used as-is; 0 requests one from getTrapId()
243 fnc_(fnc),
244 aux_(const_cast<void *>(aux)),
245 err_(SECCOMP_RET_TRAP + id_) { // reads id_, so it relies on id_ being declared before err_ below
248 // Destructor doesn't need to do anything.
249 ~ErrorCode() { }
251 // Always return the value that goes into the BPF filter program.
252 operator uint32_t() const { return err_; }
254 protected:
255 // Fields needed for SECCOMP_RET_TRAP callbacks
256 int id_;
257 TrapFnc fnc_;
258 void *aux_;
260 // 32bit field used for all possible types of ErrorCode values
261 uint32_t err_;
264 enum Operation { // Comparison kinds; stored in Constraint::op
265 OP_NOP, OP_EQUAL, OP_NOTEQUAL, OP_LESS,
266 OP_LESS_EQUAL, OP_GREATER, OP_GREATER_EQUAL,
267 OP_HAS_BITS, OP_DOES_NOT_HAVE_BITS
270 struct Constraint { // Handed by pointer to an EvaluateArguments callback
271 bool is32bit;
272 Operation op;
273 uint32_t value; // NOTE(review): presumably the operand that the argument is compared against using op; confirm
274 ErrorCode passed; // NOTE(review): presumably the result when the comparison holds; confirm
275 ErrorCode failed; // NOTE(review): presumably the result when the comparison does not hold; confirm
278 typedef ErrorCode (*EvaluateSyscall)(int sysno); // callback: system call number -> ErrorCode
279 typedef int (*EvaluateArguments)(int sysno, int arg,
280 Constraint *constraint); // callback: receives a Constraint by pointer for the given system call and argument
281 typedef std::vector<std::pair<EvaluateSyscall,EvaluateArguments> >Evaluators; // list of (syscall evaluator, argument evaluator) pairs
283 // There are a lot of reasons why the Seccomp sandbox might not be available.
284 // This could be because the kernel does not support Seccomp mode, or it
285 // could be because another sandbox is already active.
286 // "proc_fd" should be a file descriptor for "/proc", or -1 if not
287 // provided by the caller.
288 static SandboxStatus supportsSeccompSandbox(int proc_fd);
290 // The sandbox needs to be able to access files in "/proc/self". If this
291 // directory is not accessible when "startSandbox()" gets called, the caller
292 // can provide an already opened file descriptor by calling "setProcFd()".
293 // The sandbox becomes the new owner of this file descriptor and will
294 // eventually close it when "startSandbox()" executes.
295 static void setProcFd(int proc_fd);
297 // The system call evaluator function is called with the system
298 // call number. It can decide to allow the system call unconditionally
299 // by returning "0"; it can deny the system call unconditionally by
300 // returning an appropriate "errno" value; or it can request inspection
301 // of system call argument(s) by returning a suitable combination of
302 // SB_INSPECT_ARG_x bits.
303 // The system argument evaluator is called (if needed) to query additional
304 // constraints for the system call arguments. In the vast majority of
305 // cases, it will set a "Constraint" that forces a new "errno" value.
306 // But for more complex filters, it is possible to return another mask
307 // of SB_INSPECT_ARG_x bits.
308 static void setSandboxPolicy(EvaluateSyscall syscallEvaluator,
309 EvaluateArguments argumentEvaluator);
311 // This is the main public entry point. It finds all system calls that
312 // need rewriting, sets up the resources needed by the sandbox, and
313 // enters Seccomp mode.
314 static void startSandbox();
316 protected:
317 // Print an error message and terminate the program. Used for fatal errors.
318 static void die(const char *msg) __attribute__((noreturn)) {
319 if (msg) { // a NULL msg skips all reporting
320 #ifndef SECCOMP_BPF_STANDALONE
321 if (!dryRun_) {
322 // LOG(FATAL) is not necessarily async-signal safe. It would be
323 // better to always use the code for the SECCOMP_BPF_STANDALONE case.
324 // But that prevents the logging and reporting infrastructure from
325 // picking up sandbox related crashes.
326 // For now, in picking between two evils, we decided in favor of
327 // LOG(FATAL). In the long run, we probably want to rewrite this code
328 // to be async-signal safe.
329 LOG(FATAL) << msg;
330 } else
331 #endif
333 // If there is no logging infrastructure in place, we just write error
334 // messages to stderr.
335 // We also write to stderr, if we are called in a child process from
336 // supportsSeccompSandbox(). This makes sure we can actually do the
337 // correct logging from the parent process, which is more likely to
338 // have access to logging infrastructure.
339 if (HANDLE_EINTR(write(2, msg, strlen(msg)))) { } // the empty if-body just consumes write()'s result; failures are not acted upon
340 if (HANDLE_EINTR(write(2, "\n", 1))) { }
343 for (;;) {
344 // exit_group() should exit our program. After all, it is defined as a
345 // function that doesn't return. But things can theoretically go wrong.
346 // Especially, since we are dealing with system call filters. Continuing
347 // execution would be very bad in most cases where die() gets called.
348 // So, if there is no way for us to ask for the program to exit, the next
349 // best thing we can do is to loop indefinitely. Maybe, somebody will
350 // notice and file a bug...
351 syscall(__NR_exit_group, 1); // raw system call first, then the libc wrapper as a second attempt
352 _exit(1);
356 // Get a file descriptor pointing to "/proc", if currently available.
357 static int getProcFd() { return proc_fd_; }
359 private:
360 friend class Util;
361 friend class Verifier;
362 struct Range { // A span [from, to] of 32-bit values paired with one ErrorCode. NOTE(review): presumably system call numbers, inclusive bounds; confirm in findRanges()
363 Range(uint32_t f, uint32_t t, const ErrorCode& e) :
364 from(f),
365 to(t),
366 err(e) {
368 uint32_t from, to;
369 ErrorCode err;
371 struct FixUp { // NOTE(review): presumably a jump whose target is patched later (see RetInsns); confirm
372 FixUp(unsigned int a, bool j) :
373 jt(j), addr(a) { }
374 bool jt:1; // 1 + 31 bits: both bitfields together occupy 32 bits
375 unsigned addr:31; // only 31 bits wide; cf. SECCOMP_MAX_PROGRAM_SIZE (1<<30)
377 typedef std::vector<Range> Ranges;
378 typedef std::map<uint32_t, std::vector<FixUp> > RetInsns; // 32-bit key -> list of FixUps
379 typedef std::vector<struct sock_filter> Program; // sequence of BPF instructions
380 typedef std::vector<ErrorCode> Traps;
381 typedef std::map<std::pair<TrapFnc, const void *>, int> TrapIds; // (handler, aux) pair -> trap id
383 static ErrorCode probeEvaluator(int signo) __attribute__((const)); // NOTE(review): parameter is named signo, but EvaluateSyscall calls it sysno; looks like a system call number, confirm
384 static void probeProcess(void);
385 static ErrorCode allowAllEvaluator(int signo); // NOTE(review): same signo/sysno naming mismatch as probeEvaluator
386 static void tryVsyscallProcess(void);
387 static bool kernelSupportSeccompBPF(int proc_fd);
388 static bool RunFunctionInPolicy(void (*function)(),
389 EvaluateSyscall syscallEvaluator,
390 int proc_fd);
391 static bool isSingleThreaded(int proc_fd);
392 static bool disableFilesystem();
393 static void policySanityChecks(EvaluateSyscall syscallEvaluator,
394 EvaluateArguments argumentEvaluator);
395 static void installFilter();
396 static void findRanges(Ranges *ranges);
397 static void emitJumpStatements(Program *program, RetInsns *rets,
398 Ranges::const_iterator start,
399 Ranges::const_iterator stop);
400 static void emitReturnStatements(Program *prog, const RetInsns& rets);
401 static void sigSys(int nr, siginfo_t *info, void *void_context); // signature matches a three-argument (SA_SIGINFO-style) signal handler
402 static intptr_t bpfFailure(const struct arch_seccomp_data& data, void *aux); // signature matches TrapFnc
403 static int getTrapId(TrapFnc fnc, const void *aux); // used by ErrorCode(TrapFnc, ...) when no explicit id is given
405 static bool dryRun_; // when false, die() reports through LOG(FATAL) instead of stderr (non-standalone builds)
406 static SandboxStatus status_;
407 static int proc_fd_; // returned by getProcFd()
408 static Evaluators evaluators_;
409 static Traps *traps_;
410 static TrapIds trapIds_;
411 static ErrorCode *trapArray_;
412 static size_t trapArraySize_;
413 DISALLOW_IMPLICIT_CONSTRUCTORS(Sandbox); // every member visible here is static; the class is not meant to be instantiated or copied
416 } // namespace
418 #endif // SANDBOX_BPF_H__