samples/seccomp/user-trap.c

   1 #include <signal.h>
   2 #include <stdio.h>
   3 #include <stdlib.h>
   4 #include <unistd.h>
   5 #include <errno.h>
   6 #include <fcntl.h>
   7 #include <string.h>
   8 #include <stddef.h>
   9 #include <sys/sysmacros.h>
  10 #include <sys/types.h>
  11 #include <sys/wait.h>
  12 #include <sys/socket.h>
  13 #include <sys/stat.h>
  14 #include <sys/mman.h>
  15 #include <sys/syscall.h>
  16 #include <sys/user.h>
  17 #include <sys/ioctl.h>
  18 #include <sys/ptrace.h>
  19 #include <sys/mount.h>
  20 #include <linux/limits.h>
  21 #include <linux/filter.h>
  22 #include <linux/seccomp.h>
  23
  24 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
  25
  26 static int seccomp(unsigned int op, unsigned int flags, void *args)
  27 {
  28         errno = 0;
  29         return syscall(__NR_seccomp, op, flags, args);
  30 }
  31
  32 static int send_fd(int sock, int fd)
  33 {
  34         struct msghdr msg = {};
  35         struct cmsghdr *cmsg;
  36         char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
  37         struct iovec io = {
  38                 .iov_base = &c,
  39                 .iov_len = 1,
  40         };
  41
  42         msg.msg_iov = &io;
  43         msg.msg_iovlen = 1;
  44         msg.msg_control = buf;
  45         msg.msg_controllen = sizeof(buf);
  46         cmsg = CMSG_FIRSTHDR(&msg);
  47         cmsg->cmsg_level = SOL_SOCKET;
  48         cmsg->cmsg_type = SCM_RIGHTS;
  49         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  50         *((int *)CMSG_DATA(cmsg)) = fd;
  51         msg.msg_controllen = cmsg->cmsg_len;
  52
  53         if (sendmsg(sock, &msg, 0) < 0) {
  54                 perror("sendmsg");
  55                 return -1;
  56         }
  57
  58         return 0;
  59 }
  60
  61 static int recv_fd(int sock)
  62 {
  63         struct msghdr msg = {};
  64         struct cmsghdr *cmsg;
  65         char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
  66         struct iovec io = {
  67                 .iov_base = &c,
  68                 .iov_len = 1,
  69         };
  70
  71         msg.msg_iov = &io;
  72         msg.msg_iovlen = 1;
  73         msg.msg_control = buf;
  74         msg.msg_controllen = sizeof(buf);
  75
  76         if (recvmsg(sock, &msg, 0) < 0) {
  77                 perror("recvmsg");
  78                 return -1;
  79         }
  80
  81         cmsg = CMSG_FIRSTHDR(&msg);
  82
  83         return *((int *)CMSG_DATA(cmsg));
  84 }
  85
  86 static int user_trap_syscall(int nr, unsigned int flags)
  87 {
  88         struct sock_filter filter[] = {
  89                 BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
  90                         offsetof(struct seccomp_data, nr)),
  91                 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
  92                 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
  93                 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
  94         };
  95
  96         struct sock_fprog prog = {
  97                 .len = (unsigned short)ARRAY_SIZE(filter),
  98                 .filter = filter,
  99         };
 100
 101         return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
 102 }
 103
 104 static int handle_req(struct seccomp_notif *req,
 105                       struct seccomp_notif_resp *resp, int listener)
 106 {
 107         char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
 108         int ret = -1, mem;
 109
 110         resp->id = req->id;
 111         resp->error = -EPERM;
 112         resp->val = 0;
 113
 114         if (req->data.nr != __NR_mount) {
 115                 fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
 116                 return -1;
 117         }
 118
 119         /* Only allow bind mounts. */
 120         if (!(req->data.args[3] & MS_BIND))
 121                 return 0;
 122
 123         /*
 124          * Ok, let's read the task's memory to see where they wanted their
 125          * mount to go.
 126          */
 127         snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
 128         mem = open(path, O_RDONLY);
 129         if (mem < 0) {
 130                 perror("open mem");
 131                 return -1;
 132         }
 133
 134         /*
 135          * Now we avoid a TOCTOU: we referred to a pid by its pid, but since
 136          * the pid that made the syscall may have died, we need to confirm that
 137          * the pid is still valid after we open its /proc/pid/mem file. We can
 138          * ask the listener fd this as follows.
 139          *
 140          * Note that this check should occur *after* any task-specific
 141          * resources are opened, to make sure that the task has not died and
 142          * we're not wrongly reading someone else's state in order to make
 143          * decisions.
 144          */
 145         if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
 146                 fprintf(stderr, "task died before we could map its memory\n");
 147                 goto out;
 148         }
 149
 150         /*
 151          * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
 152          * that to avoid another TOCTOU, we should read all of the pointer args
 153          * before we decide to allow the syscall.
 154          */
 155         if (lseek(mem, req->data.args[0], SEEK_SET) < 0) {
 156                 perror("seek");
 157                 goto out;
 158         }
 159
 160         ret = read(mem, source, sizeof(source));
 161         if (ret < 0) {
 162                 perror("read");
 163                 goto out;
 164         }
 165
 166         if (lseek(mem, req->data.args[1], SEEK_SET) < 0) {
 167                 perror("seek");
 168                 goto out;
 169         }
 170
 171         ret = read(mem, target, sizeof(target));
 172         if (ret < 0) {
 173                 perror("read");
 174                 goto out;
 175         }
 176
 177         /*
 178          * Our policy is to only allow bind mounts inside /tmp. This isn't very
 179          * interesting, because we could do unprivlieged bind mounts with user
 180          * namespaces already, but you get the idea.
 181          */
 182         if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
 183                 if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
 184                         ret = -1;
 185                         perror("actual mount");
 186                         goto out;
 187                 }
 188                 resp->error = 0;
 189         }
 190
 191         /* Even if we didn't allow it because of policy, generating the
 192          * response was be a success, because we want to tell the worker EPERM.
 193          */
 194         ret = 0;
 195
 196 out:
 197         close(mem);
 198         return ret;
 199 }
 200
 201 int main(void)
 202 {
 203         int sk_pair[2], ret = 1, status, listener;
 204         pid_t worker = 0 , tracer = 0;
 205
 206         if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
 207                 perror("socketpair");
 208                 return 1;
 209         }
 210
 211         worker = fork();
 212         if (worker < 0) {
 213                 perror("fork");
 214                 goto close_pair;
 215         }
 216
 217         if (worker == 0) {
 218                 listener = user_trap_syscall(__NR_mount,
 219                                              SECCOMP_FILTER_FLAG_NEW_LISTENER);
 220                 if (listener < 0) {
 221                         perror("seccomp");
 222                         exit(1);
 223                 }
 224
 225                 /*
 226                  * Drop privileges. We definitely can't mount as uid 1000.
 227                  */
 228                 if (setuid(1000) < 0) {
 229                         perror("setuid");
 230                         exit(1);
 231                 }
 232
 233                 /*
 234                  * Send the listener to the parent; also serves as
 235                  * synchronization.
 236                  */
 237                 if (send_fd(sk_pair[1], listener) < 0)
 238                         exit(1);
 239                 close(listener);
 240
 241                 if (mkdir("/tmp/foo", 0755) < 0) {
 242                         perror("mkdir");
 243                         exit(1);
 244                 }
 245
 246                 /*
 247                  * Try a bad mount just for grins.
 248                  */
 249                 if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
 250                         fprintf(stderr, "huh? mounted /dev/sda?\n");
 251                         exit(1);
 252                 }
 253
 254                 if (errno != EPERM) {
 255                         perror("bad error from mount");
 256                         exit(1);
 257                 }
 258
 259                 /*
 260                  * Ok, we expect this one to succeed.
 261                  */
 262                 if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
 263                         perror("mount");
 264                         exit(1);
 265                 }
 266
 267                 exit(0);
 268         }
 269
 270         /*
 271          * Get the listener from the child.
 272          */
 273         listener = recv_fd(sk_pair[0]);
 274         if (listener < 0)
 275                 goto out_kill;
 276
 277         /*
 278          * Fork a task to handle the requests. This isn't strictly necessary,
 279          * but it makes the particular writing of this sample easier, since we
 280          * can just wait ofr the tracee to exit and kill the tracer.
 281          */
 282         tracer = fork();
 283         if (tracer < 0) {
 284                 perror("fork");
 285                 goto out_kill;
 286         }
 287
 288         if (tracer == 0) {
 289                 struct seccomp_notif *req;
 290                 struct seccomp_notif_resp *resp;
 291                 struct seccomp_notif_sizes sizes;
 292
 293                 if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
 294                         perror("seccomp(GET_NOTIF_SIZES)");
 295                         goto out_close;
 296                 }
 297
 298                 req = malloc(sizes.seccomp_notif);
 299                 if (!req)
 300                         goto out_close;
 301                 memset(req, 0, sizeof(*req));
 302
 303                 resp = malloc(sizes.seccomp_notif_resp);
 304                 if (!resp)
 305                         goto out_req;
 306                 memset(resp, 0, sizeof(*resp));
 307
 308                 while (1) {
 309                         if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
 310                                 perror("ioctl recv");
 311                                 goto out_resp;
 312                         }
 313
 314                         if (handle_req(req, resp, listener) < 0)
 315                                 goto out_resp;
 316
 317                         /*
 318                          * ENOENT here means that the task may have gotten a
 319                          * signal and restarted the syscall. It's up to the
 320                          * handler to decide what to do in this case, but for
 321                          * the sample code, we just ignore it. Probably
 322                          * something better should happen, like undoing the
 323                          * mount, or keeping track of the args to make sure we
 324                          * don't do it again.
 325                          */
 326                         if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
 327                             errno != ENOENT) {
 328                                 perror("ioctl send");
 329                                 goto out_resp;
 330                         }
 331                 }
 332 out_resp:
 333                 free(resp);
 334 out_req:
 335                 free(req);
 336 out_close:
 337                 close(listener);
 338                 exit(1);
 339         }
 340
 341         close(listener);
 342
 343         if (waitpid(worker, &status, 0) != worker) {
 344                 perror("waitpid");
 345                 goto out_kill;
 346         }
 347
 348         if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
 349                 perror("umount2");
 350                 goto out_kill;
 351         }
 352
 353         if (remove("/tmp/foo") < 0 && errno != ENOENT) {
 354                 perror("remove");
 355                 exit(1);
 356         }
 357
 358         if (!WIFEXITED(status) || WEXITSTATUS(status)) {
 359                 fprintf(stderr, "worker exited nonzero\n");
 360                 goto out_kill;
 361         }
 362
 363         ret = 0;
 364
 365 out_kill:
 366         if (tracer > 0)
 367                 kill(tracer, SIGKILL);
 368         if (worker > 0)
 369                 kill(worker, SIGKILL);
 370
 371 close_pair:
 372         close(sk_pair[0]);
 373         close(sk_pair[1]);
 374         return ret;
 375 }