2 * Copyright (c) 2012 Will Drewry <wad@dataspill.org>
3 * Copyright (c) 2015,2017,2019,2020,2023 Damien Miller <djm@mindrot.org>
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 * Uncomment the SANDBOX_SECCOMP_FILTER_DEBUG macro below to help diagnose
20 * filter breakage during development. *Do not* use this in production,
21 * as it relies on making library calls that are unsafe in signal context.
23 * Instead, on live systems, auditctl(8) may be used to monitor failures.
25 * auditctl -a task,always -F uid=<privsep uid>
27 /* #define SANDBOX_SECCOMP_FILTER_DEBUG 1 */
31 * For older toolchains, it may be necessary to use the kernel headers directly.
34 #ifdef SANDBOX_SECCOMP_FILTER_DEBUG
35 # include <asm/siginfo.h>
36 # define __have_siginfo_t 1
37 # define __have_sigval_t 1
38 # define __have_sigevent_t 1
39 #endif /* SANDBOX_SECCOMP_FILTER_DEBUG */
44 #ifdef SANDBOX_SECCOMP_FILTER
46 #include <sys/types.h>
47 #include <sys/resource.h>
48 #include <sys/prctl.h>
50 #include <sys/syscall.h>
52 #include <linux/futex.h>
53 #include <linux/net.h>
54 #include <linux/audit.h>
55 #include <linux/filter.h>
56 #include <linux/seccomp.h>
59 #include <asm/unistd.h>
61 #include <asm/zcrypt.h>
67 #include <stddef.h> /* for offsetof */
74 #include "ssh-sandbox.h"
77 /* Linux seccomp_filter sandbox */
/*
 * SECCOMP_FILTER_FAIL is the filter action returned by the preauth program
 * for an architecture mismatch and for any syscall that no rule matches.
 */
#define SECCOMP_FILTER_FAIL SECCOMP_RET_KILL

/* Use a signal handler to emit violations when debugging */
#ifdef SANDBOX_SECCOMP_FILTER_DEBUG
# undef SECCOMP_FILTER_FAIL
# define SECCOMP_FILTER_FAIL SECCOMP_RET_TRAP
#endif /* SANDBOX_SECCOMP_FILTER_DEBUG */
/*
 * Byte offsets of the low and high 32-bit words of a 64-bit syscall
 * argument within struct seccomp_data.  The argument-checking rules add
 * these to offsetof(struct seccomp_data, args[n]) and load each word with
 * a separate BPF_W load.
 */
#if __BYTE_ORDER == __LITTLE_ENDIAN
# define ARG_LO_OFFSET  0
# define ARG_HI_OFFSET  sizeof(uint32_t)
#elif __BYTE_ORDER == __BIG_ENDIAN
# define ARG_LO_OFFSET  sizeof(uint32_t)
# define ARG_HI_OFFSET  0
#else
#error "Unknown endianness"
#endif
/* Simple helpers to avoid manual errors (but larger BPF programs). */
/*
 * Each rule compares the value in the BPF accumulator with _nr.  On a match
 * it returns its verdict; otherwise it jumps over its own return
 * instruction so the next rule is evaluated.
 */
/* SC_DENY: fail syscall _nr with SECCOMP_RET_ERRNO carrying _errno. */
#define SC_DENY(_nr, _errno) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO|(_errno))
/* SC_ALLOW: permit syscall _nr regardless of its arguments. */
#define SC_ALLOW(_nr) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
/*
 * SC_ALLOW_ARG: permit syscall _nr only when argument number _arg_nr is
 * exactly equal to _arg_val.  The 64-bit argument is compared as two 32-bit
 * words.  If either word differs, control reaches the last instruction,
 * which reloads the syscall number so that the next rule can test it.
 */
#define SC_ALLOW_ARG(_nr, _arg_nr, _arg_val) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 6), \
	/* load and test syscall argument, low word */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_LO_OFFSET), \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, \
	    ((_arg_val) & 0xFFFFFFFF), 0, 3), \
	/* load and test syscall argument, high word */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_HI_OFFSET), \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, \
	    (((uint32_t)((uint64_t)(_arg_val) >> 32)) & 0xFFFFFFFF), 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), \
	/* reload syscall number; all rules expect it in accumulator */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, nr))
/* Allow if syscall argument contains only values in mask */
/*
 * SC_ALLOW_ARG_MASK: permit syscall _nr when argument number _arg_nr has no
 * bits set outside _arg_mask.  Each 32-bit word of the argument is ANDed
 * with the complement of the matching mask word and must come out zero.
 * If a disallowed bit is present, the syscall number is reloaded and
 * evaluation continues with the next rule.
 */
#define SC_ALLOW_ARG_MASK(_nr, _arg_nr, _arg_mask) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 8), \
	/* load, mask and test syscall argument, low word */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_LO_OFFSET), \
	BPF_STMT(BPF_ALU+BPF_AND+BPF_K, ~((_arg_mask) & 0xFFFFFFFF)), \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, 0, 4), \
	/* load, mask and test syscall argument, high word */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_HI_OFFSET), \
	BPF_STMT(BPF_ALU+BPF_AND+BPF_K, \
	    ~(((uint32_t)((uint64_t)(_arg_mask) >> 32)) & 0xFFFFFFFF)), \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), \
	/* reload syscall number; all rules expect it in accumulator */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, nr))
/* Deny unless syscall argument contains only values in mask */
/*
 * SC_DENY_UNLESS_ARG_MASK: for syscall _nr, return SECCOMP_RET_ERRNO with
 * _errno when argument number _arg_nr has any bit set outside _arg_mask.
 * When the argument passes the check, nothing is returned here: the
 * syscall number is reloaded and evaluation continues with the next rule,
 * which still has to allow the call.
 */
#define SC_DENY_UNLESS_ARG_MASK(_nr, _arg_nr, _arg_mask, _errno) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 8), \
	/* load, mask and test syscall argument, low word */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_LO_OFFSET), \
	BPF_STMT(BPF_ALU+BPF_AND+BPF_K, ~((_arg_mask) & 0xFFFFFFFF)), \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, 0, 3), \
	/* load, mask and test syscall argument, high word */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_HI_OFFSET), \
	BPF_STMT(BPF_ALU+BPF_AND+BPF_K, \
	    ~(((uint32_t)((uint64_t)(_arg_mask) >> 32)) & 0xFFFFFFFF)), \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, 1, 0), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO|(_errno)), \
	/* reload syscall number; all rules expect it in accumulator */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, nr))
/*
 * SC_DENY_UNLESS_MASK expands to nothing and is not referenced by any rule
 * visible in this file.  It used to end in a line continuation that spliced
 * the following line into its (empty) definition; it is now defined as
 * explicitly empty so the next line can never be swallowed.
 * NOTE(review): this looks like a leftover - confirm and remove if unused.
 */
#define SC_DENY_UNLESS_MASK(_nr, _arg_nr, _arg_val, _errno)
/* Special handling for futex(2) that combines a bitmap and operation number */
157 #if defined(__NR_futex) || defined(__NR_futex_time64)
/* Flag bits that may be combined with any permitted futex operation. */
#define SC_FUTEX_MASK (FUTEX_PRIVATE_FLAG|FUTEX_CLOCK_REALTIME)
/*
 * SC_ALLOW_FUTEX_OP: permit syscall _nr when argument 1, with the
 * SC_FUTEX_MASK bits cleared, equals operation _op.  The argument is checked
 * as two 32-bit words.  On a mismatch the syscall number is reloaded and
 * evaluation continues with the next rule.
 */
#define SC_ALLOW_FUTEX_OP(_nr, _op) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 8), \
	/* load syscall argument, low word */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, args[1]) + ARG_LO_OFFSET), \
	/* mask off allowed bitmap values, low word */ \
	BPF_STMT(BPF_ALU+BPF_AND+BPF_K, ~(SC_FUTEX_MASK & 0xFFFFFFFF)), \
	/* test operation number, low word */ \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ((_op) & 0xFFFFFFFF), 0, 4), \
	/* load syscall argument, high word */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
	    offsetof(struct seccomp_data, args[1]) + ARG_HI_OFFSET), \
	/* mask off allowed bitmap values, high word */ \
	BPF_STMT(BPF_ALU+BPF_AND+BPF_K, \
	    ~(((uint32_t)((uint64_t)SC_FUTEX_MASK >> 32)) & 0xFFFFFFFF)), \
	/* test operation number, high word */ \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, \
	    (((uint32_t)((uint64_t)(_op) >> 32)) & 0xFFFFFFFF), 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), \
	/* reload syscall number; all rules expect it in accumulator */ \
	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr))
/* Use this for both __NR_futex and __NR_futex_time64 */
/*
 * SC_FUTEX: permit the WAIT, WAIT_BITSET, WAKE, WAKE_BITSET, REQUEUE and
 * CMP_REQUEUE futex operations for syscall number _nr.  Every rule is
 * keyed on _nr (not on a fixed __NR_futex), so SC_FUTEX(__NR_futex_time64)
 * filters the time64 variant rather than repeating the __NR_futex rules.
 */
# define SC_FUTEX(_nr) \
	SC_ALLOW_FUTEX_OP(_nr, FUTEX_WAIT), \
	SC_ALLOW_FUTEX_OP(_nr, FUTEX_WAIT_BITSET), \
	SC_ALLOW_FUTEX_OP(_nr, FUTEX_WAKE), \
	SC_ALLOW_FUTEX_OP(_nr, FUTEX_WAKE_BITSET), \
	SC_ALLOW_FUTEX_OP(_nr, FUTEX_REQUEUE), \
	SC_ALLOW_FUTEX_OP(_nr, FUTEX_CMP_REQUEUE)
189 #endif /* __NR_futex || __NR_futex_time64 */
191 #if defined(__NR_mmap) || defined(__NR_mmap2)
/*
 * SC_MMAP_FLAGS: the only mmap flag bits the sandbox accepts.
 * MAP_FIXED_NOREPLACE is included only when the headers define it.  The
 * expansion is parenthesized so it stays one operand wherever it is used.
 */
# ifdef MAP_FIXED_NOREPLACE
#  define SC_MMAP_FLAGS \
	(MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED|MAP_FIXED_NOREPLACE)
# else
#  define SC_MMAP_FLAGS (MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED)
# endif /* MAP_FIXED_NOREPLACE */
/* Use this for both __NR_mmap and __NR_mmap2 variants */
/*
 * SC_MMAP: for syscall _nr, first fail with EINVAL if argument 3 has bits
 * outside SC_MMAP_FLAGS, then allow the call if argument 2 contains only
 * PROT_READ/PROT_WRITE/PROT_NONE bits.
 */
# define SC_MMAP(_nr) \
	SC_DENY_UNLESS_ARG_MASK(_nr, 3, SC_MMAP_FLAGS, EINVAL), \
	SC_ALLOW_ARG_MASK(_nr, 2, PROT_READ|PROT_WRITE|PROT_NONE)
201 #endif /* __NR_mmap || __NR_mmap2 */
203 /* Syscall filtering set for preauth. */
/*
 * preauth_insns: the BPF instruction array used as the seccomp filter for
 * the pre-authentication sandbox.  It loads the arch field of
 * struct seccomp_data and returns SECCOMP_FILTER_FAIL unless it equals
 * SECCOMP_AUDIT_ARCH, then loads the syscall number and runs the SC_DENY
 * and SC_ALLOW* rules in order.  A syscall that no rule matches reaches the
 * final SECCOMP_FILTER_FAIL return.
 * NOTE(review): this view of the array appears to be missing lines (several
 * preprocessor guards and the closing brace are not visible) - confirm
 * against the full file before changing any rule.
 */
204 static const struct sock_filter preauth_insns
[] = {
205 /* Ensure the syscall arch convention is as expected. */
206 BPF_STMT(BPF_LD
+BPF_W
+BPF_ABS
,
207 offsetof(struct seccomp_data
, arch
)),
208 BPF_JUMP(BPF_JMP
+BPF_JEQ
+BPF_K
, SECCOMP_AUDIT_ARCH
, 1, 0),
209 BPF_STMT(BPF_RET
+BPF_K
, SECCOMP_FILTER_FAIL
),
210 /* Load the syscall number for checking. */
211 BPF_STMT(BPF_LD
+BPF_W
+BPF_ABS
,
212 offsetof(struct seccomp_data
, nr
)),
214 /* Syscalls to non-fatally deny */
216 SC_DENY(__NR_lstat
, EACCES
),
219 SC_DENY(__NR_lstat64
, EACCES
),
222 SC_DENY(__NR_fstat
, EACCES
),
225 SC_DENY(__NR_fstat64
, EACCES
),
227 #ifdef __NR_fstatat64
228 SC_DENY(__NR_fstatat64
, EACCES
),
231 SC_DENY(__NR_open
, EACCES
),
234 SC_DENY(__NR_openat
, EACCES
),
236 #ifdef __NR_newfstatat
237 SC_DENY(__NR_newfstatat
, EACCES
),
240 SC_DENY(__NR_stat
, EACCES
),
243 SC_DENY(__NR_stat64
, EACCES
),
246 SC_DENY(__NR_shmget
, EACCES
),
249 SC_DENY(__NR_shmat
, EACCES
),
252 SC_DENY(__NR_shmdt
, EACCES
),
255 SC_DENY(__NR_ipc
, EACCES
),
258 SC_DENY(__NR_statx
, EACCES
),
261 /* Syscalls to permit */
265 #ifdef __NR_clock_gettime
266 SC_ALLOW(__NR_clock_gettime
),
268 #ifdef __NR_clock_gettime64
269 SC_ALLOW(__NR_clock_gettime64
),
272 SC_ALLOW(__NR_close
),
277 #ifdef __NR_exit_group
278 SC_ALLOW(__NR_exit_group
),
281 SC_FUTEX(__NR_futex
),
283 #ifdef __NR_futex_time64
284 SC_FUTEX(__NR_futex_time64
),
287 SC_ALLOW(__NR_geteuid
),
289 #ifdef __NR_geteuid32
290 SC_ALLOW(__NR_geteuid32
),
293 SC_ALLOW(__NR_getpgid
),
296 SC_ALLOW(__NR_getpid
),
298 #ifdef __NR_getrandom
299 SC_ALLOW(__NR_getrandom
),
302 SC_ALLOW(__NR_gettid
),
304 #ifdef __NR_gettimeofday
305 SC_ALLOW(__NR_gettimeofday
),
308 SC_ALLOW(__NR_getuid
),
311 SC_ALLOW(__NR_getuid32
),
314 SC_ALLOW_ARG(__NR_madvise
, 2, MADV_NORMAL
),
316 SC_ALLOW_ARG(__NR_madvise
, 2, MADV_FREE
),
318 # ifdef MADV_DONTNEED
319 SC_ALLOW_ARG(__NR_madvise
, 2, MADV_DONTNEED
),
321 # ifdef MADV_DONTFORK
322 SC_ALLOW_ARG(__NR_madvise
, 2, MADV_DONTFORK
),
324 # ifdef MADV_DONTDUMP
325 SC_ALLOW_ARG(__NR_madvise
, 2, MADV_DONTDUMP
),
327 # ifdef MADV_WIPEONFORK
328 SC_ALLOW_ARG(__NR_madvise
, 2, MADV_WIPEONFORK
),
330 SC_DENY(__NR_madvise
, EINVAL
),
339 SC_ALLOW_ARG_MASK(__NR_mprotect
, 2, PROT_READ
|PROT_WRITE
|PROT_NONE
),
342 SC_ALLOW(__NR_mremap
),
345 SC_ALLOW(__NR_munmap
),
347 #ifdef __NR_nanosleep
348 SC_ALLOW(__NR_nanosleep
),
350 #ifdef __NR_clock_nanosleep
351 SC_ALLOW(__NR_clock_nanosleep
),
353 #ifdef __NR_clock_nanosleep_time64
354 SC_ALLOW(__NR_clock_nanosleep_time64
),
356 #ifdef __NR_clock_gettime64
357 SC_ALLOW(__NR_clock_gettime64
),
359 #ifdef __NR__newselect
360 SC_ALLOW(__NR__newselect
),
363 SC_ALLOW(__NR_ppoll
),
365 #ifdef __NR_ppoll_time64
366 SC_ALLOW(__NR_ppoll_time64
),
372 SC_ALLOW(__NR_pselect6
),
374 #ifdef __NR_pselect6_time64
375 SC_ALLOW(__NR_pselect6_time64
),
380 #ifdef __NR_rt_sigprocmask
381 SC_ALLOW(__NR_rt_sigprocmask
),
384 SC_ALLOW(__NR_select
),
387 SC_ALLOW(__NR_shutdown
),
389 #ifdef __NR_sigprocmask
390 SC_ALLOW(__NR_sigprocmask
),
396 SC_ALLOW(__NR_write
),
399 SC_ALLOW(__NR_writev
),
401 #ifdef __NR_socketcall
402 SC_ALLOW_ARG(__NR_socketcall
, 0, SYS_SHUTDOWN
),
403 SC_DENY(__NR_socketcall
, EACCES
),
405 #if defined(__NR_ioctl) && defined(__s390__)
406 /* Allow ioctls for ICA crypto card on s390 */
407 SC_ALLOW_ARG(__NR_ioctl
, 1, Z90STAT_STATUS_MASK
),
408 SC_ALLOW_ARG(__NR_ioctl
, 1, ICARSAMODEXPO
),
409 SC_ALLOW_ARG(__NR_ioctl
, 1, ICARSACRT
),
410 SC_ALLOW_ARG(__NR_ioctl
, 1, ZSECSENDCPRB
),
411 /* Allow ioctls for EP11 crypto card on s390 */
412 SC_ALLOW_ARG(__NR_ioctl
, 1, ZSENDEP11CPRB
),
414 #if defined(__x86_64__) && defined(__ILP32__) && defined(__X32_SYSCALL_BIT)
416 * On Linux x32, the clock_gettime VDSO falls back to the
417 * x86-64 syscall under some circumstances, e.g.
418 * https://bugs.debian.org/849923
420 SC_ALLOW(__NR_clock_gettime
& ~__X32_SYSCALL_BIT
),
424 BPF_STMT(BPF_RET
+BPF_K
, SECCOMP_FILTER_FAIL
),
/*
 * preauth_program: the struct sock_fprog that is passed to
 * prctl(PR_SET_SECCOMP) in ssh_sandbox_child.  .len is the number of
 * instructions in preauth_insns and .filter points at that array with its
 * const qualifier cast away (presumably because struct sock_fprog declares
 * a non-const pointer - confirm against the kernel header).
 */
427 static const struct sock_fprog preauth_program
= {
428 .len
= (unsigned short)(sizeof(preauth_insns
)/sizeof(preauth_insns
[0])),
429 .filter
= (struct sock_filter
*)preauth_insns
,
/*
 * ssh_sandbox_init: log that the seccomp filter sandbox is being prepared
 * and allocate one zero-filled struct ssh_sandbox with xcalloc.  The
 * monitor argument is not used in the lines visible here.
 * NOTE(review): the return type, braces and return statement are not
 * visible in this view - confirm against the full file.
 */
437 ssh_sandbox_init(struct monitor
*monitor
)
439 struct ssh_sandbox
*box
;
442 * Strictly, we don't need to maintain any state here but we need
443 * to return non-NULL to satisfy the API.
445 debug3("%s: preparing seccomp filter sandbox", __func__
);
446 box
= xcalloc(1, sizeof(*box
));
452 #ifdef SANDBOX_SECCOMP_FILTER_DEBUG
/*
 * Declarations needed only by the debug SIGSYS handler: the monitor pointer
 * that is passed as the context argument, and the log handler that the
 * violation message is handed to.
 */
453 extern struct monitor
*pmonitor
;
454 void mm_log_handler(LogLevel level
, int forced
, const char *msg
, void *ctx
);
/*
 * ssh_sandbox_violation: SIGSYS handler used in debug builds.  It formats
 * the architecture, syscall number and call address from the siginfo_t
 * into msg and passes the text to mm_log_handler at SYSLOG_LEVEL_FATAL
 * with pmonitor as context.  signum and void_context are not used in the
 * lines visible here.
 * NOTE(review): the declaration of msg and the function braces are not
 * visible in this view - confirm against the full file.
 */
457 ssh_sandbox_violation(int signum
, siginfo_t
*info
, void *void_context
)
461 snprintf(msg
, sizeof(msg
),
462 "%s: unexpected system call (arch:0x%x,syscall:%d @ %p)",
463 __func__
, info
->si_arch
, info
->si_syscall
, info
->si_call_addr
);
464 mm_log_handler(SYSLOG_LEVEL_FATAL
, 0, msg
, pmonitor
);
/*
 * ssh_sandbox_child_debugging: install ssh_sandbox_violation as the
 * SA_SIGINFO handler for SIGSYS and unblock SIGSYS in the signal mask.
 * Calls fatal() if sigaction or sigprocmask fails.
 * NOTE(review): the declaration and initialisation of mask are not visible
 * in this view - confirm against the full file.
 */
469 ssh_sandbox_child_debugging(void)
471 struct sigaction act
;
474 debug3("%s: installing SIGSYS handler", __func__
);
475 memset(&act
, 0, sizeof(act
));
477 sigaddset(&mask
, SIGSYS
);
479 act
.sa_sigaction
= &ssh_sandbox_violation
;
480 act
.sa_flags
= SA_SIGINFO
;
481 if (sigaction(SIGSYS
, &act
, NULL
) == -1)
482 fatal("%s: sigaction(SIGSYS): %s", __func__
, strerror(errno
));
483 if (sigprocmask(SIG_UNBLOCK
, &mask
, NULL
) == -1)
484 fatal("%s: sigprocmask(SIGSYS): %s",
485 __func__
, strerror(errno
));
487 #endif /* SANDBOX_SECCOMP_FILTER_DEBUG */
/*
 * ssh_sandbox_child: apply the sandbox to the calling process.  It sets
 * RLIMIT_FSIZE and RLIMIT_NPROC to zero and RLIMIT_NOFILE to one (fatal on
 * failure), installs the SIGSYS handler in debug builds, sets
 * PR_SET_NO_NEW_PRIVS and attaches preauth_program with
 * prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER).  The box argument is not
 * used in the lines visible here.
 * The RLIMIT_NOFILE failure message now reports { 1, 1 }, which is the
 * limit actually passed (rl_one), instead of { 0, 0 }.
 * NOTE(review): some lines of this function are not visible in this view
 * (for example the condition guarding the final fatal() and the braces) -
 * confirm against the full file.
 */
490 ssh_sandbox_child(struct ssh_sandbox
*box
)
492 struct rlimit rl_zero
, rl_one
= {.rlim_cur
= 1, .rlim_max
= 1};
495 /* Set rlimits for completeness if possible. */
496 rl_zero
.rlim_cur
= rl_zero
.rlim_max
= 0;
497 if (setrlimit(RLIMIT_FSIZE
, &rl_zero
) == -1)
498 fatal("%s: setrlimit(RLIMIT_FSIZE, { 0, 0 }): %s",
499 __func__
, strerror(errno
));
501 * Cannot use zero for nfds, because poll(2) will fail with
502 * errno=EINVAL if nfds>RLIMIT_NOFILE.
504 if (setrlimit(RLIMIT_NOFILE
, &rl_one
) == -1)
505 fatal("%s: setrlimit(RLIMIT_NOFILE, { 1, 1 }): %s",
506 __func__
, strerror(errno
));
507 if (setrlimit(RLIMIT_NPROC
, &rl_zero
) == -1)
508 fatal("%s: setrlimit(RLIMIT_NPROC, { 0, 0 }): %s",
509 __func__
, strerror(errno
));
511 #ifdef SANDBOX_SECCOMP_FILTER_DEBUG
512 ssh_sandbox_child_debugging();
513 #endif /* SANDBOX_SECCOMP_FILTER_DEBUG */
515 debug3("%s: setting PR_SET_NO_NEW_PRIVS", __func__
);
516 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) == -1) {
517 debug("%s: prctl(PR_SET_NO_NEW_PRIVS): %s",
518 __func__
, strerror(errno
));
521 debug3("%s: attaching seccomp filter program", __func__
);
522 if (prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, &preauth_program
) == -1)
523 debug("%s: prctl(PR_SET_SECCOMP): %s",
524 __func__
, strerror(errno
));
526 fatal("%s: SECCOMP_MODE_FILTER activated but "
527 "PR_SET_NO_NEW_PRIVS failed", __func__
);
/*
 * ssh_sandbox_parent_finish: parent-side teardown hook; the only statement
 * visible here logs completion at debug3 level.
 * NOTE(review): other lines of this function (braces, any release of box)
 * are not visible in this view - confirm against the full file.
 */
531 ssh_sandbox_parent_finish(struct ssh_sandbox
*box
)
534 debug3("%s: finished", __func__
);
/*
 * ssh_sandbox_parent_preauth: record the pid of the sandboxed child in
 * box->child_pid.
 */
538 ssh_sandbox_parent_preauth(struct ssh_sandbox
*box
, pid_t child_pid
)
540 box
->child_pid
= child_pid
;
543 #endif /* SANDBOX_SECCOMP_FILTER */