// SPDX-License-Identifier: GPL-2.0-only
/*
 * Functions to manage eBPF programs attached to cgroups
 *
 * Copyright (c) 2016 Daniel Mack
 */

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>

#include "../cgroup/cgroup-internal.h"
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
void cgroup_bpf_offline(struct cgroup *cgrp)
{
	cgroup_get(cgrp);
	percpu_ref_kill(&cgrp->bpf.refcnt);
}
/**
 * cgroup_bpf_release() - put references of all bpf programs and
 *                        release all cgroup bpf data
 * @work: work structure embedded into the cgroup to modify
 */
static void cgroup_bpf_release(struct work_struct *work)
{
	struct cgroup *cgrp = container_of(work, struct cgroup,
					   bpf.release_work);
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_array *old_array;
	unsigned int type;

	mutex_lock(&cgroup_mutex);

	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
		struct list_head *progs = &cgrp->bpf.progs[type];
		struct bpf_prog_list *pl, *tmp;

		list_for_each_entry_safe(pl, tmp, progs, node) {
			list_del(&pl->node);
			bpf_prog_put(pl->prog);
			for_each_cgroup_storage_type(stype) {
				bpf_cgroup_storage_unlink(pl->storage[stype]);
				bpf_cgroup_storage_free(pl->storage[stype]);
			}
			kfree(pl);
			static_branch_dec(&cgroup_bpf_enabled_key);
		}
		old_array = rcu_dereference_protected(
				cgrp->bpf.effective[type],
				lockdep_is_held(&cgroup_mutex));
		bpf_prog_array_free(old_array);
	}

	mutex_unlock(&cgroup_mutex);

	percpu_ref_exit(&cgrp->bpf.refcnt);
	cgroup_put(cgrp);
}
/**
 * cgroup_bpf_release_fn() - callback used to schedule releasing
 *                           of bpf cgroup data
 * @ref: percpu ref counter structure
 */
static void cgroup_bpf_release_fn(struct percpu_ref *ref)
{
	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);

	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
	queue_work(system_wq, &cgrp->bpf.release_work);
}
/* count number of elements in the list.
 * it's slow but the list cannot be long
 */
static u32 prog_list_length(struct list_head *head)
{
	struct bpf_prog_list *pl;
	u32 cnt = 0;

	list_for_each_entry(pl, head, node) {
		if (!pl->prog)
			continue;
		cnt++;
	}
	return cnt;
}
/* if parent has non-overridable prog attached,
 * disallow attaching new programs to the descendant cgroup.
 * if parent has overridable or multi-prog, allow attaching
 */
static bool hierarchy_allows_attach(struct cgroup *cgrp,
				    enum bpf_attach_type type,
				    u32 new_flags)
{
	struct cgroup *p;

	p = cgroup_parent(cgrp);
	if (!p)
		return true;
	do {
		u32 flags = p->bpf.flags[type];
		u32 cnt;

		if (flags & BPF_F_ALLOW_MULTI)
			return true;
		cnt = prog_list_length(&p->bpf.progs[type]);
		WARN_ON_ONCE(cnt > 1);
		if (cnt == 1)
			return !!(flags & BPF_F_ALLOW_OVERRIDE);
		p = cgroup_parent(p);
	} while (p);
	return true;
}
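/* Worked example of the rule above (illustrative hierarchy, not taken
 * from this file): with a prog attached to /A and no flags, any attach
 * in /A/B fails with -EPERM.  With BPF_F_ALLOW_OVERRIDE on /A, a prog
 * attached to /A/B replaces A's prog in B's effective set.  With
 * BPF_F_ALLOW_MULTI, both progs end up in B's effective set.
 */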
/* compute a chain of effective programs for a given cgroup:
 * start from the list of programs in this cgroup and add
 * all parent programs.
 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
 * to programs in this cgroup
 */
static int compute_effective_progs(struct cgroup *cgrp,
				   enum bpf_attach_type type,
				   struct bpf_prog_array **array)
{
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_array *progs;
	struct bpf_prog_list *pl;
	struct cgroup *p = cgrp;
	int cnt = 0;

	/* count number of effective programs by walking parents */
	do {
		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			cnt += prog_list_length(&p->bpf.progs[type]);
		p = cgroup_parent(p);
	} while (p);

	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
	if (!progs)
		return -ENOMEM;

	/* populate the array with effective progs */
	cnt = 0;
	p = cgrp;
	do {
		if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			continue;

		list_for_each_entry(pl, &p->bpf.progs[type], node) {
			if (!pl->prog)
				continue;

			progs->items[cnt].prog = pl->prog;
			for_each_cgroup_storage_type(stype)
				progs->items[cnt].cgroup_storage[stype] =
					pl->storage[stype];
			cnt++;
		}
	} while ((p = cgroup_parent(p)));

	*array = progs;
	return 0;
}
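/* Illustration (hypothetical hierarchy): with P1 attached to /A and
 * P2, P3 attached to /A/B, all with BPF_F_ALLOW_MULTI, the array
 * computed for /A/B is:
 *
 *	effective[type]->items[] = { P2, P3, P1 }
 *
 * i.e. a cgroup's own programs (in attach order) come first in the
 * array and therefore run before those of its ancestors.
 */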
static void activate_effective_progs(struct cgroup *cgrp,
				     enum bpf_attach_type type,
				     struct bpf_prog_array *old_array)
{
	rcu_swap_protected(cgrp->bpf.effective[type], old_array,
			   lockdep_is_held(&cgroup_mutex));
	/* free prog array after grace period, since __cgroup_bpf_run_*()
	 * might be still walking the array
	 */
	bpf_prog_array_free(old_array);
}
/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 */
int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use macro instead of const int, since compiler thinks
 * that array below is variable length
 */
#define	NR ARRAY_SIZE(cgrp->bpf.effective)
	struct bpf_prog_array *arrays[NR] = {};
	int ret, i;

	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
			      GFP_KERNEL);
	if (ret)
		return ret;

	for (i = 0; i < NR; i++)
		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

	for (i = 0; i < NR; i++)
		if (compute_effective_progs(cgrp, i, &arrays[i]))
			goto cleanup;

	for (i = 0; i < NR; i++)
		activate_effective_progs(cgrp, i, arrays[i]);

	return 0;
cleanup:
	for (i = 0; i < NR; i++)
		bpf_prog_array_free(arrays[i]);

	percpu_ref_exit(&cgrp->bpf.refcnt);

	return -ENOMEM;
}
static int update_effective_progs(struct cgroup *cgrp,
				  enum bpf_attach_type type)
{
	struct cgroup_subsys_state *css;
	int err;

	/* allocate and recompute effective prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		if (percpu_ref_is_zero(&desc->bpf.refcnt))
			continue;

		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
		if (err)
			goto cleanup;
	}

	/* all allocations were successful. Activate all prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
			if (unlikely(desc->bpf.inactive)) {
				bpf_prog_array_free(desc->bpf.inactive);
				desc->bpf.inactive = NULL;
			}
			continue;
		}

		activate_effective_progs(desc, type, desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return 0;

cleanup:
	/* oom while computing effective. Free all computed effective arrays
	 * since they were not activated
	 */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		bpf_prog_array_free(desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return err;
}
#define BPF_CGROUP_MAX_PROGS 64

/**
 * __cgroup_bpf_attach() - Attach the program to a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to attach
 * @type: Type of attach operation
 * @flags: Option flags
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type, u32 flags)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_list *pl;
	bool pl_was_allocated;
	int err;

	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
		/* invalid combination */
		return -EINVAL;

	if (!hierarchy_allows_attach(cgrp, type, flags))
		return -EPERM;

	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
		/* Disallow attaching non-overridable on top
		 * of existing overridable in this cgroup.
		 * Disallow attaching multi-prog if overridable or none
		 */
		return -EPERM;

	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
		return -E2BIG;

	for_each_cgroup_storage_type(stype) {
		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
		if (IS_ERR(storage[stype])) {
			storage[stype] = NULL;
			for_each_cgroup_storage_type(stype)
				bpf_cgroup_storage_free(storage[stype]);
			return -ENOMEM;
		}
	}

	if (flags & BPF_F_ALLOW_MULTI) {
		list_for_each_entry(pl, progs, node) {
			if (pl->prog == prog) {
				/* disallow attaching the same prog twice */
				for_each_cgroup_storage_type(stype)
					bpf_cgroup_storage_free(storage[stype]);
				return -EINVAL;
			}
		}

		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
		if (!pl) {
			for_each_cgroup_storage_type(stype)
				bpf_cgroup_storage_free(storage[stype]);
			return -ENOMEM;
		}

		pl_was_allocated = true;
		pl->prog = prog;
		for_each_cgroup_storage_type(stype)
			pl->storage[stype] = storage[stype];
		list_add_tail(&pl->node, progs);
	} else {
		if (list_empty(progs)) {
			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
			if (!pl) {
				for_each_cgroup_storage_type(stype)
					bpf_cgroup_storage_free(storage[stype]);
				return -ENOMEM;
			}
			pl_was_allocated = true;
			list_add_tail(&pl->node, progs);
		} else {
			pl = list_first_entry(progs, typeof(*pl), node);
			old_prog = pl->prog;
			for_each_cgroup_storage_type(stype) {
				old_storage[stype] = pl->storage[stype];
				bpf_cgroup_storage_unlink(old_storage[stype]);
			}
			pl_was_allocated = false;
		}
		pl->prog = prog;
		for_each_cgroup_storage_type(stype)
			pl->storage[stype] = storage[stype];
	}

	cgrp->bpf.flags[type] = flags;

	err = update_effective_progs(cgrp, type);
	if (err)
		goto cleanup;

	static_branch_inc(&cgroup_bpf_enabled_key);
	for_each_cgroup_storage_type(stype) {
		if (!old_storage[stype])
			continue;
		bpf_cgroup_storage_free(old_storage[stype]);
	}
	if (old_prog) {
		bpf_prog_put(old_prog);
		static_branch_dec(&cgroup_bpf_enabled_key);
	}
	for_each_cgroup_storage_type(stype)
		bpf_cgroup_storage_link(storage[stype], cgrp, type);
	return 0;

cleanup:
	/* and cleanup the prog list */
	pl->prog = old_prog;
	for_each_cgroup_storage_type(stype) {
		bpf_cgroup_storage_free(pl->storage[stype]);
		pl->storage[stype] = old_storage[stype];
		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
	}
	if (pl_was_allocated) {
		list_del(&pl->node);
		kfree(pl);
	}
	return err;
}
/**
 * __cgroup_bpf_detach() - Detach the program from a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to detach or NULL
 * @type: Type of detach operation
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	enum bpf_cgroup_storage_type stype;
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_prog_list *pl;
	int err;

	if (flags & BPF_F_ALLOW_MULTI) {
		if (!prog)
			/* to detach MULTI prog the user has to specify valid FD
			 * of the program to be detached
			 */
			return -EINVAL;
	} else {
		if (list_empty(progs))
			/* report error when trying to detach and nothing is attached */
			return -ENOENT;
	}

	if (flags & BPF_F_ALLOW_MULTI) {
		/* find the prog and detach it */
		list_for_each_entry(pl, progs, node) {
			if (pl->prog != prog)
				continue;
			old_prog = prog;
			/* mark it deleted, so it's ignored while
			 * recomputing effective
			 */
			pl->prog = NULL;
			break;
		}
		if (!old_prog)
			return -ENOENT;
	} else {
		/* to maintain backward compatibility NONE and OVERRIDE cgroups
		 * allow detaching with invalid FD (prog==NULL)
		 */
		pl = list_first_entry(progs, typeof(*pl), node);
		old_prog = pl->prog;
		pl->prog = NULL;
	}

	err = update_effective_progs(cgrp, type);
	if (err)
		goto cleanup;

	/* now can actually delete it from this cgroup list */
	list_del(&pl->node);
	for_each_cgroup_storage_type(stype) {
		bpf_cgroup_storage_unlink(pl->storage[stype]);
		bpf_cgroup_storage_free(pl->storage[stype]);
	}
	kfree(pl);
	if (list_empty(progs))
		/* last program was detached, reset flags to zero */
		cgrp->bpf.flags[type] = 0;

	bpf_prog_put(old_prog);
	static_branch_dec(&cgroup_bpf_enabled_key);
	return 0;

cleanup:
	/* and restore back old_prog */
	pl->prog = old_prog;
	return err;
}
/* Must be called with cgroup_mutex held to avoid races. */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		       union bpf_attr __user *uattr)
{
	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
	enum bpf_attach_type type = attr->query.attach_type;
	struct list_head *progs = &cgrp->bpf.progs[type];
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog_array *effective;
	int cnt, ret = 0, i;

	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
					      lockdep_is_held(&cgroup_mutex));

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
		cnt = bpf_prog_array_length(effective);
	else
		cnt = prog_list_length(progs);

	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
		return -EFAULT;
	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
		return -EFAULT;
	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
		/* return early if user requested only program count + flags */
		return 0;
	if (attr->query.prog_cnt < cnt) {
		cnt = attr->query.prog_cnt;
		ret = -ENOSPC;
	}

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
	} else {
		struct bpf_prog_list *pl;
		u32 id;

		i = 0;
		list_for_each_entry(pl, progs, node) {
			id = pl->prog->aux->id;
			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
				return -EFAULT;
			if (++i == cnt)
				break;
		}
	}
	return ret;
}
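/* Illustrative user-space counterpart (cgroup_fd assumed valid):
 *
 *	__u32 ids[64];
 *	union bpf_attr attr = {};
 *
 *	attr.query.target_fd   = cgroup_fd;
 *	attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
 *	attr.query.query_flags = BPF_F_QUERY_EFFECTIVE;
 *	attr.query.prog_ids    = (__u64)(unsigned long)ids;
 *	attr.query.prog_cnt    = 64;
 *	syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
 *
 * On return attr.query.prog_cnt holds the total count; -ENOSPC means
 * the id array was too small to hold it.
 */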
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
			   enum bpf_prog_type ptype, struct bpf_prog *prog)
{
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
				attr->attach_flags);
	cgroup_put(cgrp);
	return ret;
}
int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
{
	struct bpf_prog *prog;
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
	if (IS_ERR(prog))
		prog = NULL;

	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
	if (prog)
		bpf_prog_put(prog);

	cgroup_put(cgrp);
	return ret;
}
int cgroup_bpf_prog_query(const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->query.target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	ret = cgroup_bpf_query(cgrp, attr, uattr);

	cgroup_put(cgrp);
	return ret;
}
/**
 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @type: The type of program to be executed
 *
 * If no socket is passed, or the socket is not of type INET or INET6,
 * this function does nothing and returns 0.
 *
 * The program type passed in via @type must be suitable for network
 * filtering. No further check is performed to assert that.
 *
 * For egress packets, this function can return:
 *   NET_XMIT_SUCCESS    (0)	- continue with packet output
 *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
 *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
 *				  to call cwr
 *   -EPERM			- drop packet
 *
 * For ingress packets, this function will return -EPERM if any
 * attached program was found and if it returned != 1 during execution.
 * Otherwise 0 is returned.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
				struct sk_buff *skb,
				enum bpf_attach_type type)
{
	unsigned int offset = skb->data - skb_network_header(skb);
	struct sock *save_sk;
	void *saved_data_end;
	struct cgroup *cgrp;
	int ret;

	if (!sk || !sk_fullsock(sk))
		return 0;

	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	save_sk = skb->sk;
	skb->sk = sk;
	__skb_push(skb, offset);

	/* compute pointers for the bpf prog */
	bpf_compute_and_save_data_end(skb, &saved_data_end);

	if (type == BPF_CGROUP_INET_EGRESS) {
		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
	} else {
		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
					 __bpf_prog_run_save_cb);
		ret = (ret == 1 ? 0 : -EPERM);
	}
	bpf_restore_data_end(skb, saved_data_end);
	__skb_pull(skb, offset);
	skb->sk = save_sk;

	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
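/* For reference, the simplest program this hook can run (sketch;
 * returning 1 allows the packet, 0 is mapped to -EPERM/drop above):
 *
 *	SEC("cgroup_skb/ingress")
 *	int allow_all(struct __sk_buff *skb)
 *	{
 *		return 1;
 *	}
 */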
/**
 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 * @sk: sock structure to manipulate
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and if it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sk(struct sock *sk,
			       enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	int ret;

	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
/**
 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and the
 *                                       sockaddr provided by user space
 * @sk: sock struct that will use sockaddr
 * @uaddr: sockaddr struct provided by user
 * @type: The type of program to be executed
 * @t_ctx: Pointer to attach type specific context
 *
 * The socket is expected to be of type INET or INET6.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
				      struct sockaddr *uaddr,
				      enum bpf_attach_type type,
				      void *t_ctx)
{
	struct bpf_sock_addr_kern ctx = {
		.sk = sk,
		.uaddr = uaddr,
		.t_ctx = t_ctx,
	};
	struct sockaddr_storage unspec;
	struct cgroup *cgrp;
	int ret;

	/* Check socket family since not all sockets represent network
	 * endpoint (e.g. AF_UNIX).
	 */
	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	if (!ctx.uaddr) {
		memset(&unspec, 0, sizeof(unspec));
		ctx.uaddr = (struct sockaddr *)&unspec;
	}

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);

	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
/**
 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
 * @sk: socket to get cgroup from
 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
 * sk with connection information (IP addresses, etc.) May not contain
 * cgroup info if it is a req sock.
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock_ops
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and if it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
				     struct bpf_sock_ops_kern *sock_ops,
				     enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	int ret;

	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
				 BPF_PROG_RUN);
	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
				      short access, enum bpf_attach_type type)
{
	struct cgroup *cgrp;
	struct bpf_cgroup_dev_ctx ctx = {
		.access_type = (access << 16) | dev_type,
		.major = major,
		.minor = minor,
	};
	int allow = 1;

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
				   BPF_PROG_RUN);
	rcu_read_unlock();

	return !allow;
}
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
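/* Sketch of a device program consuming the ctx built above; note the
 * encoding: the access bits sit in the upper half of access_type, the
 * BPF_DEVCG_DEV_* device type in the lower half:
 *
 *	SEC("cgroup/dev")
 *	int dev_filter(struct bpf_cgroup_dev_ctx *ctx)
 *	{
 *		short type = ctx->access_type & 0xFFFF;
 *
 *		return type == BPF_DEVCG_DEV_CHAR;
 *	}
 *
 * which allows access to character devices only.
 */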
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_map_push_elem:
		return &bpf_map_push_elem_proto;
	case BPF_FUNC_map_pop_elem:
		return &bpf_map_pop_elem_proto;
	case BPF_FUNC_map_peek_elem:
		return &bpf_map_peek_elem_proto;
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_get_local_storage:
		return &bpf_get_local_storage_proto;
	case BPF_FUNC_get_current_cgroup_id:
		return &bpf_get_current_cgroup_id_proto;
	case BPF_FUNC_trace_printk:
		if (capable(CAP_SYS_ADMIN))
			return bpf_get_trace_printk_proto();
		/* fall through */
	default:
		return NULL;
	}
}
static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	return cgroup_base_func_proto(func_id, prog);
}
static bool cgroup_dev_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (type == BPF_WRITE)
		return false;

	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
		return false;
	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;

	switch (off) {
	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
		bpf_ctx_record_field_size(info, size_default);
		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
			return false;
		break;
	default:
		if (size != size_default)
			return false;
	}

	return true;
}
const struct bpf_prog_ops cg_dev_prog_ops = {
};

const struct bpf_verifier_ops cg_dev_verifier_ops = {
	.get_func_proto		= cgroup_dev_func_proto,
	.is_valid_access	= cgroup_dev_is_valid_access,
};
/**
 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
 *
 * @head: sysctl table header
 * @table: sysctl table
 * @write: sysctl is being read (= 0) or written (= 1)
 * @buf: pointer to buffer passed by user space
 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
 *	result is size of @new_buf if program set new value, initial value
 *	otherwise
 * @ppos: value-result argument: value is position at which read from or write
 *	to sysctl is happening, result is new position if program overrode it,
 *	initial value otherwise
 * @new_buf: pointer to pointer to new buffer that will be allocated if program
 *	overrides new value provided by user space on sysctl write
 *	NOTE: it's the caller's responsibility to free *new_buf if it was set
 * @type: type of program to be executed
 *
 * Program is run when sysctl is being accessed, either read or written, and
 * can allow or deny such access.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases 0 is returned.
 */
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
				   struct ctl_table *table, int write,
				   void __user *buf, size_t *pcount,
				   loff_t *ppos, void **new_buf,
				   enum bpf_attach_type type)
{
	struct bpf_sysctl_kern ctx = {
		.head = head,
		.table = table,
		.write = write,
		.ppos = ppos,
		.cur_val = NULL,
		.cur_len = PAGE_SIZE,
		.new_val = NULL,
		.new_len = 0,
		.new_updated = 0,
	};
	struct cgroup *cgrp;
	int ret;

	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
	if (ctx.cur_val) {
		mm_segment_t old_fs;
		loff_t pos = 0;

		old_fs = get_fs();
		set_fs(KERNEL_DS);
		if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
					&ctx.cur_len, &pos)) {
			/* Let BPF program decide how to proceed. */
			ctx.cur_len = 0;
		}
		set_fs(old_fs);
	} else {
		/* Let BPF program decide how to proceed. */
		ctx.cur_len = 0;
	}

	if (write && buf && *pcount) {
		/* BPF program should be able to override new value with a
		 * buffer bigger than provided by user.
		 */
		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
		if (!ctx.new_val ||
		    copy_from_user(ctx.new_val, buf, ctx.new_len))
			/* Let BPF program decide how to proceed. */
			ctx.new_len = 0;
	}

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
	rcu_read_unlock();

	kfree(ctx.cur_val);

	if (ret == 1 && ctx.new_updated) {
		*new_buf = ctx.new_val;
		*pcount = ctx.new_len;
	} else {
		kfree(ctx.new_val);
	}

	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
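/* Illustrative sysctl program for this hook: returning 0 denies the
 * access (mapped to -EPERM above), returning 1 allows it.
 *
 *	SEC("cgroup/sysctl")
 *	int sysctl_guard(struct bpf_sysctl *ctx)
 *	{
 *		return ctx->write ? 0 : 1;
 *	}
 *
 * denies all sysctl writes in the cgroup while leaving reads alone.
 */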
#ifdef CONFIG_NET
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
					     enum bpf_attach_type attach_type)
{
	struct bpf_prog_array *prog_array;
	bool empty;

	rcu_read_lock();
	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
	empty = bpf_prog_array_is_empty(prog_array);
	rcu_read_unlock();

	return empty;
}
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
{
	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
		return -EINVAL;

	ctx->optval = kzalloc(max_optlen, GFP_USER);
	if (!ctx->optval)
		return -ENOMEM;

	ctx->optval_end = ctx->optval + max_optlen;
	ctx->optlen = max_optlen;

	return 0;
}
static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
{
	kfree(ctx->optval);
}
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
				       int *optname, char __user *optval,
				       int *optlen, char **kernel_optval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = *level,
		.optname = *optname,
	};
	int ret;

	/* Opportunistic check to see whether we have any BPF program
	 * attached to the hook so we don't waste time allocating
	 * memory and locking the socket.
	 */
	if (!cgroup_bpf_enabled ||
	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
		return 0;

	ret = sockopt_alloc_buf(&ctx, *optlen);
	if (ret)
		return ret;

	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
		ret = -EFAULT;
		goto out;
	}

	lock_sock(sk);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
				 &ctx, BPF_PROG_RUN);
	release_sock(sk);

	if (!ret) {
		ret = -EPERM;
		goto out;
	}

	if (ctx.optlen == -1) {
		/* optlen set to -1, bypass kernel */
		ret = 1;
	} else if (ctx.optlen > *optlen || ctx.optlen < -1) {
		/* optlen is out of bounds */
		ret = -EFAULT;
	} else {
		/* optlen within bounds, run kernel handler */
		ret = 0;

		/* export any potential modifications */
		*level = ctx.level;
		*optname = ctx.optname;
		*optlen = ctx.optlen;
		*kernel_optval = ctx.optval;
	}

out:
	if (ret)
		sockopt_free_buf(&ctx);
	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
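/* Illustrative setsockopt program for the hook above: returning 0
 * rejects the syscall with -EPERM, returning 1 runs the kernel handler
 * on the (possibly modified) context, and setting optlen = -1 bypasses
 * the kernel handler entirely:
 *
 *	SEC("cgroup/setsockopt")
 *	int block_ip_options(struct bpf_sockopt *ctx)
 *	{
 *		if (ctx->level == SOL_IP && ctx->optname == IP_OPTIONS)
 *			return 0;
 *		return 1;
 *	}
 */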
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
				       int optname, char __user *optval,
				       int __user *optlen, int max_optlen,
				       int retval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = level,
		.optname = optname,
		.retval = retval,
	};
	int ret;

	/* Opportunistic check to see whether we have any BPF program
	 * attached to the hook so we don't waste time allocating
	 * memory and locking the socket.
	 */
	if (!cgroup_bpf_enabled ||
	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
		return retval;

	ret = sockopt_alloc_buf(&ctx, max_optlen);
	if (ret)
		return ret;

	if (!retval) {
		/* If kernel getsockopt finished successfully,
		 * copy whatever was returned to the user back
		 * into our temporary buffer. Set optlen to the
		 * one that kernel returned as well to let
		 * BPF programs inspect the value.
		 */

		if (get_user(ctx.optlen, optlen)) {
			ret = -EFAULT;
			goto out;
		}

		if (ctx.optlen > max_optlen)
			ctx.optlen = max_optlen;

		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
			ret = -EFAULT;
			goto out;
		}
	}

	lock_sock(sk);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
				 &ctx, BPF_PROG_RUN);
	release_sock(sk);

	if (!ret) {
		ret = -EPERM;
		goto out;
	}

	if (ctx.optlen > max_optlen) {
		ret = -EFAULT;
		goto out;
	}

	/* BPF programs only allowed to set retval to 0, not some
	 * arbitrary value.
	 */
	if (ctx.retval != 0 && ctx.retval != retval) {
		ret = -EFAULT;
		goto out;
	}

	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
	    put_user(ctx.optlen, optlen)) {
		ret = -EFAULT;
		goto out;
	}

	ret = ctx.retval;

out:
	sockopt_free_buf(&ctx);
	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
#endif
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
			      size_t *lenp)
{
	ssize_t tmp_ret = 0, ret;

	if (dir->header.parent) {
		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
	if (ret < 0)
		return ret;
	*bufp += ret;
	*lenp -= ret;

	/* Avoid leading slash. */
	if (!ret)
		return tmp_ret;

	tmp_ret = strscpy(*bufp, "/", *lenp);
	if (tmp_ret < 0)
		return tmp_ret;
	*bufp += tmp_ret;
	*lenp -= tmp_ret;

	return ret + tmp_ret;
}
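/* E.g. for /proc/sys/net/ipv4/tcp_mem the recursion above builds
 * "net/ipv4/" into *bufp, one directory component plus separator per
 * level (the root directory contributes nothing thanks to the leading
 * slash check); bpf_sysctl_get_name() below then appends "tcp_mem".
 */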
BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len, u64, flags)
{
	ssize_t tmp_ret = 0, ret;

	if (!buf)
		return -EINVAL;

	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
		if (!ctx->head)
			return -EINVAL;
		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(buf, ctx->table->procname, buf_len);

	return ret < 0 ? ret : tmp_ret + ret;
}
static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
	.func		= bpf_sysctl_get_name,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};
static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
			     size_t src_len)
{
	if (!dst)
		return -EINVAL;

	if (!dst_len)
		return -E2BIG;

	if (!src || !src_len) {
		memset(dst, 0, dst_len);
		return -EINVAL;
	}

	memcpy(dst, src, min(dst_len, src_len));

	if (dst_len > src_len) {
		memset(dst + src_len, '\0', dst_len - src_len);
		return src_len;
	}

	dst[dst_len - 1] = '\0';

	return -E2BIG;
}
BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
	   char *, buf, size_t, buf_len)
{
	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
}
static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
	.func		= bpf_sysctl_get_current_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len)
{
	if (!ctx->write) {
		if (buf && buf_len)
			memset(buf, '\0', buf_len);
		return -EINVAL;
	}

	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
}
static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
	.func		= bpf_sysctl_get_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
	   const char *, buf, size_t, buf_len)
{
	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
		return -EINVAL;

	if (buf_len > PAGE_SIZE - 1)
		return -E2BIG;

	memcpy(ctx->new_val, buf, buf_len);
	ctx->new_len = buf_len;
	ctx->new_updated = 1;

	return 0;
}
static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
	.func		= bpf_sysctl_set_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_strtol:
		return &bpf_strtol_proto;
	case BPF_FUNC_strtoul:
		return &bpf_strtoul_proto;
	case BPF_FUNC_sysctl_get_name:
		return &bpf_sysctl_get_name_proto;
	case BPF_FUNC_sysctl_get_current_value:
		return &bpf_sysctl_get_current_value_proto;
	case BPF_FUNC_sysctl_get_new_value:
		return &bpf_sysctl_get_new_value_proto;
	case BPF_FUNC_sysctl_set_new_value:
		return &bpf_sysctl_set_new_value_proto;
	default:
		return cgroup_base_func_proto(func_id, prog);
	}
}
static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
				   const struct bpf_prog *prog,
				   struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
		return false;

	switch (off) {
	case offsetof(struct bpf_sysctl, write):
		if (type != BPF_READ)
			return false;
		bpf_ctx_record_field_size(info, size_default);
		return bpf_ctx_narrow_access_ok(off, size, size_default);
	case offsetof(struct bpf_sysctl, file_pos):
		if (type == BPF_READ) {
			bpf_ctx_record_field_size(info, size_default);
			return bpf_ctx_narrow_access_ok(off, size, size_default);
		} else {
			return size == size_default;
		}
	default:
		return false;
	}
}
static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_sysctl, write):
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct bpf_sysctl_kern, write,
				       FIELD_SIZEOF(struct bpf_sysctl_kern,
						    write),
				       target_size));
		break;
	case offsetof(struct bpf_sysctl, file_pos):
		/* ppos is a pointer so it should be accessed via indirect
		 * loads and stores. Also for stores additional temporary
		 * register is used since neither src_reg nor dst_reg can be
		 * overridden.
		 */
		if (type == BPF_WRITE) {
			int treg = BPF_REG_9;

			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			*insn++ = BPF_STX_MEM(
				BPF_DW, si->dst_reg, treg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			*insn++ = BPF_STX_MEM(
				BPF_SIZEOF(u32), treg, si->src_reg, 0);
			*insn++ = BPF_LDX_MEM(
				BPF_DW, treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
		} else {
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				si->dst_reg, si->src_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			*insn++ = BPF_LDX_MEM(
				BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
		}
		*target_size = sizeof(u32);
		break;
	}

	return insn - insn_buf;
}
const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
	.get_func_proto		= sysctl_func_proto,
	.is_valid_access	= sysctl_is_valid_access,
	.convert_ctx_access	= sysctl_convert_ctx_access,
};

const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
#ifdef CONFIG_NET
	case BPF_FUNC_sk_storage_get:
		return &bpf_sk_storage_get_proto;
	case BPF_FUNC_sk_storage_delete:
		return &bpf_sk_storage_delete_proto;
#endif
#ifdef CONFIG_INET
	case BPF_FUNC_tcp_sock:
		return &bpf_tcp_sock_proto;
#endif
	default:
		return cgroup_base_func_proto(func_id, prog);
	}
}
static bool cg_sockopt_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off >= sizeof(struct bpf_sockopt))
		return false;

	if (off % size != 0)
		return false;

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct bpf_sockopt, retval):
			if (size != size_default)
				return false;
			return prog->expected_attach_type ==
				BPF_CGROUP_GETSOCKOPT;
		case offsetof(struct bpf_sockopt, optname):
			/* fallthrough */
		case offsetof(struct bpf_sockopt, level):
			if (size != size_default)
				return false;
			return prog->expected_attach_type ==
				BPF_CGROUP_SETSOCKOPT;
		case offsetof(struct bpf_sockopt, optlen):
			return size == size_default;
		default:
			return false;
		}
	}

	switch (off) {
	case offsetof(struct bpf_sockopt, sk):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_SOCKET;
		break;
	case offsetof(struct bpf_sockopt, optval):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct bpf_sockopt, optval_end):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_PACKET_END;
		break;
	case offsetof(struct bpf_sockopt, retval):
		if (size != size_default)
			return false;
		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
	default:
		if (size != size_default)
			return false;
		break;
	}
	return true;
}
#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
	  si->dst_reg, si->src_reg,					\
	  offsetof(struct bpf_sockopt_kern, F))
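/* E.g. CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen) expands to
 *
 *	BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, optlen),
 *		    si->dst_reg, si->src_reg,
 *		    offsetof(struct bpf_sockopt_kern, optlen))
 *
 * i.e. a load of the kernel-side context field into dst_reg;
 * BPF_STX_MEM gives the corresponding store.
 */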
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
					 const struct bpf_insn *si,
					 struct bpf_insn *insn_buf,
					 struct bpf_prog *prog,
					 u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_sockopt, sk):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
		break;
	case offsetof(struct bpf_sockopt, level):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
		break;
	case offsetof(struct bpf_sockopt, optname):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
		break;
	case offsetof(struct bpf_sockopt, optlen):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
		break;
	case offsetof(struct bpf_sockopt, retval):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
		break;
	case offsetof(struct bpf_sockopt, optval):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
		break;
	case offsetof(struct bpf_sockopt, optval_end):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
		break;
	}

	return insn - insn_buf;
}
static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
				   bool direct_write,
				   const struct bpf_prog *prog)
{
	/* Nothing to do for sockopt argument. The data is kzalloc'ated.
	 */
	return 0;
}
const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
	.get_func_proto		= cg_sockopt_func_proto,
	.is_valid_access	= cg_sockopt_is_valid_access,
	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
	.gen_prologue		= cg_sockopt_get_prologue,
};

const struct bpf_prog_ops cg_sockopt_prog_ops = {
};