kernel/bpf/cgroup.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Functions to manage eBPF programs attached to cgroups
   4  *
   5  * Copyright (c) 2016 Daniel Mack
   6  */
   7
   8 #include <linux/kernel.h>
   9 #include <linux/atomic.h>
  10 #include <linux/cgroup.h>
  11 #include <linux/filter.h>
  12 #include <linux/slab.h>
  13 #include <linux/sysctl.h>
  14 #include <linux/string.h>
  15 #include <linux/bpf.h>
  16 #include <linux/bpf-cgroup.h>
  17 #include <linux/bpf_lsm.h>
  18 #include <linux/bpf_verifier.h>
  19 #include <net/sock.h>
  20 #include <net/bpf_sk_storage.h>
  21
  22 #include "../cgroup/cgroup-internal.h"
  23
     /* One static key per cgroup-bpf attach type, all initially false.
      * In this file the key for an attach type is incremented when a new
      * prog-list entry is attached (__cgroup_bpf_attach) and decremented
      * for every entry torn down in cgroup_bpf_release().
      */
  24 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
  25 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
  26
  27 /*
  28  * cgroup bpf destruction makes heavy use of work items and there can be a lot
  29  * of concurrent destructions.  Use a separate workqueue so that cgroup bpf
  30  * destruction work items don't end up filling up max_active of system_wq
  31  * which may lead to deadlock.
  32  */
  33 static struct workqueue_struct *cgroup_bpf_destroy_wq;
  34
  35 static int __init cgroup_bpf_wq_init(void)
  36 {
  37         cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
  38         if (!cgroup_bpf_destroy_wq)
  39                 panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
  40         return 0;
  41 }
  42 core_initcall(cgroup_bpf_wq_init);
  43
  44 /* __always_inline is necessary to prevent indirect call through run_prog
  45  * function pointer.
  46  */
  47 static __always_inline int
  48 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
  49                       enum cgroup_bpf_attach_type atype,
  50                       const void *ctx, bpf_prog_run_fn run_prog,
  51                       int retval, u32 *ret_flags)
  52 {
  53         const struct bpf_prog_array_item *item;
  54         const struct bpf_prog *prog;
  55         const struct bpf_prog_array *array;
  56         struct bpf_run_ctx *old_run_ctx;
  57         struct bpf_cg_run_ctx run_ctx;
  58         u32 func_ret;
  59
  60         run_ctx.retval = retval;
  61         migrate_disable();
  62         rcu_read_lock();
  63         array = rcu_dereference(cgrp->effective[atype]);
  64         item = &array->items[0];
  65         old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
  66         while ((prog = READ_ONCE(item->prog))) {
  67                 run_ctx.prog_item = item;
  68                 func_ret = run_prog(prog, ctx);
  69                 if (ret_flags) {
  70                         *(ret_flags) |= (func_ret >> 1);
  71                         func_ret &= 1;
  72                 }
  73                 if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
  74                         run_ctx.retval = -EPERM;
  75                 item++;
  76         }
  77         bpf_reset_run_ctx(old_run_ctx);
  78         rcu_read_unlock();
  79         migrate_enable();
  80         return run_ctx.retval;
  81 }
  82
  83 unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
  84                                        const struct bpf_insn *insn)
  85 {
  86         const struct bpf_prog *shim_prog;
  87         struct sock *sk;
  88         struct cgroup *cgrp;
  89         int ret = 0;
  90         u64 *args;
  91
  92         args = (u64 *)ctx;
  93         sk = (void *)(unsigned long)args[0];
  94         /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
  95         shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
  96
  97         cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
  98         if (likely(cgrp))
  99                 ret = bpf_prog_run_array_cg(&cgrp->bpf,
 100                                             shim_prog->aux->cgroup_atype,
 101                                             ctx, bpf_prog_run, 0, NULL);
 102         return ret;
 103 }
 104
 105 unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
 106                                          const struct bpf_insn *insn)
 107 {
 108         const struct bpf_prog *shim_prog;
 109         struct socket *sock;
 110         struct cgroup *cgrp;
 111         int ret = 0;
 112         u64 *args;
 113
 114         args = (u64 *)ctx;
 115         sock = (void *)(unsigned long)args[0];
 116         /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
 117         shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
 118
 119         cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
 120         if (likely(cgrp))
 121                 ret = bpf_prog_run_array_cg(&cgrp->bpf,
 122                                             shim_prog->aux->cgroup_atype,
 123                                             ctx, bpf_prog_run, 0, NULL);
 124         return ret;
 125 }
 126
 127 unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
 128                                           const struct bpf_insn *insn)
 129 {
 130         const struct bpf_prog *shim_prog;
 131         struct cgroup *cgrp;
 132         int ret = 0;
 133
 134         /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
 135         shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
 136
 137         /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
 138         cgrp = task_dfl_cgroup(current);
 139         if (likely(cgrp))
 140                 ret = bpf_prog_run_array_cg(&cgrp->bpf,
 141                                             shim_prog->aux->cgroup_atype,
 142                                             ctx, bpf_prog_run, 0, NULL);
 143         return ret;
 144 }
 145
 146 #ifdef CONFIG_BPF_LSM
     /* Bookkeeping for one BPF_LSM_CGROUP attach-type slot. */
 147 struct cgroup_lsm_atype {
 148         u32 attach_btf_id;      /* BTF id recorded for this slot; 0 means unused */
 149         int refcnt;             /* raised by bpf_cgroup_atype_get(), dropped by bpf_cgroup_atype_put() */
 150 };
 151
     /* Slot table: index i backs attach type CGROUP_LSM_START + i.
      * bpf_cgroup_atype_find()/bpf_cgroup_atype_get() assert that
      * cgroup_mutex is held; bpf_cgroup_atype_put() takes cgroup_lock().
      */
 152 static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
 153
 154 static enum cgroup_bpf_attach_type
 155 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
 156 {
 157         int i;
 158
 159         lockdep_assert_held(&cgroup_mutex);
 160
 161         if (attach_type != BPF_LSM_CGROUP)
 162                 return to_cgroup_bpf_attach_type(attach_type);
 163
 164         for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
 165                 if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
 166                         return CGROUP_LSM_START + i;
 167
 168         for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
 169                 if (cgroup_lsm_atype[i].attach_btf_id == 0)
 170                         return CGROUP_LSM_START + i;
 171
 172         return -E2BIG;
 173
 174 }
 175
 176 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
 177 {
 178         int i = cgroup_atype - CGROUP_LSM_START;
 179
 180         lockdep_assert_held(&cgroup_mutex);
 181
 182         WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
 183                      cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
 184
 185         cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
 186         cgroup_lsm_atype[i].refcnt++;
 187 }
 188
 189 void bpf_cgroup_atype_put(int cgroup_atype)
 190 {
 191         int i = cgroup_atype - CGROUP_LSM_START;
 192
 193         cgroup_lock();
 194         if (--cgroup_lsm_atype[i].refcnt <= 0)
 195                 cgroup_lsm_atype[i].attach_btf_id = 0;
 196         WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
 197         cgroup_unlock();
 198 }
 199 #else
 200 static enum cgroup_bpf_attach_type
 201 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
 202 {
 203         if (attach_type != BPF_LSM_CGROUP)
 204                 return to_cgroup_bpf_attach_type(attach_type);
 205         return -EOPNOTSUPP;
 206 }
 207 #endif /* CONFIG_BPF_LSM */
 208
 209 void cgroup_bpf_offline(struct cgroup *cgrp)
 210 {
 211         cgroup_get(cgrp);
 212         percpu_ref_kill(&cgrp->bpf.refcnt);
 213 }
 214
 215 static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
 216 {
 217         enum bpf_cgroup_storage_type stype;
 218
 219         for_each_cgroup_storage_type(stype)
 220                 bpf_cgroup_storage_free(storages[stype]);
 221 }
 222
 223 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
 224                                      struct bpf_cgroup_storage *new_storages[],
 225                                      enum bpf_attach_type type,
 226                                      struct bpf_prog *prog,
 227                                      struct cgroup *cgrp)
 228 {
 229         enum bpf_cgroup_storage_type stype;
 230         struct bpf_cgroup_storage_key key;
 231         struct bpf_map *map;
 232
 233         key.cgroup_inode_id = cgroup_id(cgrp);
 234         key.attach_type = type;
 235
 236         for_each_cgroup_storage_type(stype) {
 237                 map = prog->aux->cgroup_storage[stype];
 238                 if (!map)
 239                         continue;
 240
 241                 storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
 242                 if (storages[stype])
 243                         continue;
 244
 245                 storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
 246                 if (IS_ERR(storages[stype])) {
 247                         bpf_cgroup_storages_free(new_storages);
 248                         return -ENOMEM;
 249                 }
 250
 251                 new_storages[stype] = storages[stype];
 252         }
 253
 254         return 0;
 255 }
 256
 257 static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
 258                                        struct bpf_cgroup_storage *src[])
 259 {
 260         enum bpf_cgroup_storage_type stype;
 261
 262         for_each_cgroup_storage_type(stype)
 263                 dst[stype] = src[stype];
 264 }
 265
 266 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
 267                                      struct cgroup *cgrp,
 268                                      enum bpf_attach_type attach_type)
 269 {
 270         enum bpf_cgroup_storage_type stype;
 271
 272         for_each_cgroup_storage_type(stype)
 273                 bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
 274 }
 275
 276 /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
 277  * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
 278  * doesn't free link memory, which will eventually be done by bpf_link's
 279  * release() callback, when its last FD is closed.
 280  */
 281 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
 282 {
 283         cgroup_put(link->cgroup);
 284         link->cgroup = NULL;
 285 }
 286
 287 /**
 288  * cgroup_bpf_release() - put references of all bpf programs and
 289  *                        release all cgroup bpf data
 290  * @work: work structure embedded into the cgroup to modify
 291  */
 292 static void cgroup_bpf_release(struct work_struct *work)
 293 {
 294         struct cgroup *p, *cgrp = container_of(work, struct cgroup,
 295                                                bpf.release_work);
 296         struct bpf_prog_array *old_array;
 297         struct list_head *storages = &cgrp->bpf.storages;
 298         struct bpf_cgroup_storage *storage, *stmp;
 299
 300         unsigned int atype;
 301
 302         cgroup_lock();
 303
 304         for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
 305                 struct hlist_head *progs = &cgrp->bpf.progs[atype];
 306                 struct bpf_prog_list *pl;
 307                 struct hlist_node *pltmp;
 308
 309                 hlist_for_each_entry_safe(pl, pltmp, progs, node) {
 310                         hlist_del(&pl->node);
 311                         if (pl->prog) {
 312                                 if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
 313                                         bpf_trampoline_unlink_cgroup_shim(pl->prog);
 314                                 bpf_prog_put(pl->prog);
 315                         }
 316                         if (pl->link) {
 317                                 if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
 318                                         bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
 319                                 bpf_cgroup_link_auto_detach(pl->link);
 320                         }
 321                         kfree(pl);
 322                         static_branch_dec(&cgroup_bpf_enabled_key[atype]);
 323                 }
 324                 old_array = rcu_dereference_protected(
 325                                 cgrp->bpf.effective[atype],
 326                                 lockdep_is_held(&cgroup_mutex));
 327                 bpf_prog_array_free(old_array);
 328         }
 329
 330         list_for_each_entry_safe(storage, stmp, storages, list_cg) {
 331                 bpf_cgroup_storage_unlink(storage);
 332                 bpf_cgroup_storage_free(storage);
 333         }
 334
 335         cgroup_unlock();
 336
 337         for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
 338                 cgroup_bpf_put(p);
 339
 340         percpu_ref_exit(&cgrp->bpf.refcnt);
 341         cgroup_put(cgrp);
 342 }
 343
 344 /**
 345  * cgroup_bpf_release_fn() - callback used to schedule releasing
 346  *                           of bpf cgroup data
 347  * @ref: percpu ref counter structure
 348  */
 349 static void cgroup_bpf_release_fn(struct percpu_ref *ref)
 350 {
 351         struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
 352
 353         INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
 354         queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
 355 }
 356
 357 /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
 358  * link or direct prog.
 359  */
 360 static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
 361 {
 362         if (pl->prog)
 363                 return pl->prog;
 364         if (pl->link)
 365                 return pl->link->link.prog;
 366         return NULL;
 367 }
 368
 369 /* count number of elements in the list.
 370  * it's slow but the list cannot be long
 371  */
 372 static u32 prog_list_length(struct hlist_head *head)
 373 {
 374         struct bpf_prog_list *pl;
 375         u32 cnt = 0;
 376
 377         hlist_for_each_entry(pl, head, node) {
 378                 if (!prog_list_prog(pl))
 379                         continue;
 380                 cnt++;
 381         }
 382         return cnt;
 383 }
 384
 385 /* if parent has non-overridable prog attached,
 386  * disallow attaching new programs to the descendent cgroup.
 387  * if parent has overridable or multi-prog, allow attaching
 388  */
 389 static bool hierarchy_allows_attach(struct cgroup *cgrp,
 390                                     enum cgroup_bpf_attach_type atype)
 391 {
 392         struct cgroup *p;
 393
 394         p = cgroup_parent(cgrp);
 395         if (!p)
 396                 return true;
 397         do {
 398                 u32 flags = p->bpf.flags[atype];
 399                 u32 cnt;
 400
 401                 if (flags & BPF_F_ALLOW_MULTI)
 402                         return true;
 403                 cnt = prog_list_length(&p->bpf.progs[atype]);
 404                 WARN_ON_ONCE(cnt > 1);
 405                 if (cnt == 1)
 406                         return !!(flags & BPF_F_ALLOW_OVERRIDE);
 407                 p = cgroup_parent(p);
 408         } while (p);
 409         return true;
 410 }
 411
 412 /* compute a chain of effective programs for a given cgroup:
 413  * start from the list of programs in this cgroup and add
 414  * all parent programs.
 415  * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
 416  * to programs in this cgroup
 417  */
 418 static int compute_effective_progs(struct cgroup *cgrp,
 419                                    enum cgroup_bpf_attach_type atype,
 420                                    struct bpf_prog_array **array)
 421 {
 422         struct bpf_prog_array_item *item;
 423         struct bpf_prog_array *progs;
 424         struct bpf_prog_list *pl;
 425         struct cgroup *p = cgrp;
 426         int cnt = 0;
 427
 428         /* count number of effective programs by walking parents */
 429         do {
 430                 if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 431                         cnt += prog_list_length(&p->bpf.progs[atype]);
 432                 p = cgroup_parent(p);
 433         } while (p);
 434
 435         progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
 436         if (!progs)
 437                 return -ENOMEM;
 438
 439         /* populate the array with effective progs */
 440         cnt = 0;
 441         p = cgrp;
 442         do {
 443                 if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 444                         continue;
 445
 446                 hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
 447                         if (!prog_list_prog(pl))
 448                                 continue;
 449
 450                         item = &progs->items[cnt];
 451                         item->prog = prog_list_prog(pl);
 452                         bpf_cgroup_storages_assign(item->cgroup_storage,
 453                                                    pl->storage);
 454                         cnt++;
 455                 }
 456         } while ((p = cgroup_parent(p)));
 457
 458         *array = progs;
 459         return 0;
 460 }
 461
 462 static void activate_effective_progs(struct cgroup *cgrp,
 463                                      enum cgroup_bpf_attach_type atype,
 464                                      struct bpf_prog_array *old_array)
 465 {
 466         old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
 467                                         lockdep_is_held(&cgroup_mutex));
 468         /* free prog array after grace period, since __cgroup_bpf_run_*()
 469          * might be still walking the array
 470          */
 471         bpf_prog_array_free(old_array);
 472 }
 473
 474 /**
 475  * cgroup_bpf_inherit() - inherit effective programs from parent
 476  * @cgrp: the cgroup to modify
 477  */
 478 int cgroup_bpf_inherit(struct cgroup *cgrp)
 479 {
 480 /* has to use marco instead of const int, since compiler thinks
 481  * that array below is variable length
 482  */
 483 #define NR ARRAY_SIZE(cgrp->bpf.effective)
 484         struct bpf_prog_array *arrays[NR] = {};
 485         struct cgroup *p;
 486         int ret, i;
 487
 488         ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
 489                               GFP_KERNEL);
 490         if (ret)
 491                 return ret;
 492
 493         for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
 494                 cgroup_bpf_get(p);
 495
 496         for (i = 0; i < NR; i++)
 497                 INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
 498
 499         INIT_LIST_HEAD(&cgrp->bpf.storages);
 500
 501         for (i = 0; i < NR; i++)
 502                 if (compute_effective_progs(cgrp, i, &arrays[i]))
 503                         goto cleanup;
 504
 505         for (i = 0; i < NR; i++)
 506                 activate_effective_progs(cgrp, i, arrays[i]);
 507
 508         return 0;
 509 cleanup:
 510         for (i = 0; i < NR; i++)
 511                 bpf_prog_array_free(arrays[i]);
 512
 513         for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
 514                 cgroup_bpf_put(p);
 515
 516         percpu_ref_exit(&cgrp->bpf.refcnt);
 517
 518         return -ENOMEM;
 519 }
 520
 521 static int update_effective_progs(struct cgroup *cgrp,
 522                                   enum cgroup_bpf_attach_type atype)
 523 {
 524         struct cgroup_subsys_state *css;
 525         int err;
 526
 527         /* allocate and recompute effective prog arrays */
 528         css_for_each_descendant_pre(css, &cgrp->self) {
 529                 struct cgroup *desc = container_of(css, struct cgroup, self);
 530
 531                 if (percpu_ref_is_zero(&desc->bpf.refcnt))
 532                         continue;
 533
 534                 err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
 535                 if (err)
 536                         goto cleanup;
 537         }
 538
 539         /* all allocations were successful. Activate all prog arrays */
 540         css_for_each_descendant_pre(css, &cgrp->self) {
 541                 struct cgroup *desc = container_of(css, struct cgroup, self);
 542
 543                 if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
 544                         if (unlikely(desc->bpf.inactive)) {
 545                                 bpf_prog_array_free(desc->bpf.inactive);
 546                                 desc->bpf.inactive = NULL;
 547                         }
 548                         continue;
 549                 }
 550
 551                 activate_effective_progs(desc, atype, desc->bpf.inactive);
 552                 desc->bpf.inactive = NULL;
 553         }
 554
 555         return 0;
 556
 557 cleanup:
 558         /* oom while computing effective. Free all computed effective arrays
 559          * since they were not activated
 560          */
 561         css_for_each_descendant_pre(css, &cgrp->self) {
 562                 struct cgroup *desc = container_of(css, struct cgroup, self);
 563
 564                 bpf_prog_array_free(desc->bpf.inactive);
 565                 desc->bpf.inactive = NULL;
 566         }
 567
 568         return err;
 569 }
 570
     /* Cap on the number of programs in one prog list (one cgroup, one
      * attach type); __cgroup_bpf_attach() returns -E2BIG once the list
      * has reached it.
      */
 571 #define BPF_CGROUP_MAX_PROGS 64
 572
 573 static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
 574                                                struct bpf_prog *prog,
 575                                                struct bpf_cgroup_link *link,
 576                                                struct bpf_prog *replace_prog,
 577                                                bool allow_multi)
 578 {
 579         struct bpf_prog_list *pl;
 580
 581         /* single-attach case */
 582         if (!allow_multi) {
 583                 if (hlist_empty(progs))
 584                         return NULL;
 585                 return hlist_entry(progs->first, typeof(*pl), node);
 586         }
 587
 588         hlist_for_each_entry(pl, progs, node) {
 589                 if (prog && pl->prog == prog && prog != replace_prog)
 590                         /* disallow attaching the same prog twice */
 591                         return ERR_PTR(-EINVAL);
 592                 if (link && pl->link == link)
 593                         /* disallow attaching the same link twice */
 594                         return ERR_PTR(-EINVAL);
 595         }
 596
 597         /* direct prog multi-attach w/ replacement case */
 598         if (replace_prog) {
 599                 hlist_for_each_entry(pl, progs, node) {
 600                         if (pl->prog == replace_prog)
 601                                 /* a match found */
 602                                 return pl;
 603                 }
 604                 /* prog to replace not found for cgroup */
 605                 return ERR_PTR(-ENOENT);
 606         }
 607
 608         return NULL;
 609 }
 610
 611 /**
 612  * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
 613  *                         propagate the change to descendants
 614  * @cgrp: The cgroup which descendants to traverse
 615  * @prog: A program to attach
 616  * @link: A link to attach
 617  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
 618  * @type: Type of attach operation
 619  * @flags: Option flags
 620  *
 621  * Exactly one of @prog or @link can be non-null.
 622  * Must be called with cgroup_mutex held.
 623  */
 624 static int __cgroup_bpf_attach(struct cgroup *cgrp,
 625                                struct bpf_prog *prog, struct bpf_prog *replace_prog,
 626                                struct bpf_cgroup_link *link,
 627                                enum bpf_attach_type type, u32 flags)
 628 {
 629         u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
 630         struct bpf_prog *old_prog = NULL;
 631         struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 632         struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
 633         struct bpf_prog *new_prog = prog ? : link->link.prog;
 634         enum cgroup_bpf_attach_type atype;
 635         struct bpf_prog_list *pl;
 636         struct hlist_head *progs;
 637         int err;
 638
 639         if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
 640             ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
 641                 /* invalid combination */
 642                 return -EINVAL;
 643         if (link && (prog || replace_prog))
 644                 /* only either link or prog/replace_prog can be specified */
 645                 return -EINVAL;
 646         if (!!replace_prog != !!(flags & BPF_F_REPLACE))
 647                 /* replace_prog implies BPF_F_REPLACE, and vice versa */
 648                 return -EINVAL;
 649
 650         atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
 651         if (atype < 0)
 652                 return -EINVAL;
 653
 654         progs = &cgrp->bpf.progs[atype];
 655
 656         if (!hierarchy_allows_attach(cgrp, atype))
 657                 return -EPERM;
 658
 659         if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
 660                 /* Disallow attaching non-overridable on top
 661                  * of existing overridable in this cgroup.
 662                  * Disallow attaching multi-prog if overridable or none
 663                  */
 664                 return -EPERM;
 665
 666         if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
 667                 return -E2BIG;
 668
 669         pl = find_attach_entry(progs, prog, link, replace_prog,
 670                                flags & BPF_F_ALLOW_MULTI);
 671         if (IS_ERR(pl))
 672                 return PTR_ERR(pl);
 673
 674         if (bpf_cgroup_storages_alloc(storage, new_storage, type,
 675                                       prog ? : link->link.prog, cgrp))
 676                 return -ENOMEM;
 677
 678         if (pl) {
 679                 old_prog = pl->prog;
 680         } else {
 681                 struct hlist_node *last = NULL;
 682
 683                 pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 684                 if (!pl) {
 685                         bpf_cgroup_storages_free(new_storage);
 686                         return -ENOMEM;
 687                 }
 688                 if (hlist_empty(progs))
 689                         hlist_add_head(&pl->node, progs);
 690                 else
 691                         hlist_for_each(last, progs) {
 692                                 if (last->next)
 693                                         continue;
 694                                 hlist_add_behind(&pl->node, last);
 695                                 break;
 696                         }
 697         }
 698
 699         pl->prog = prog;
 700         pl->link = link;
 701         bpf_cgroup_storages_assign(pl->storage, storage);
 702         cgrp->bpf.flags[atype] = saved_flags;
 703
 704         if (type == BPF_LSM_CGROUP) {
 705                 err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
 706                 if (err)
 707                         goto cleanup;
 708         }
 709
 710         err = update_effective_progs(cgrp, atype);
 711         if (err)
 712                 goto cleanup_trampoline;
 713
 714         if (old_prog) {
 715                 if (type == BPF_LSM_CGROUP)
 716                         bpf_trampoline_unlink_cgroup_shim(old_prog);
 717                 bpf_prog_put(old_prog);
 718         } else {
 719                 static_branch_inc(&cgroup_bpf_enabled_key[atype]);
 720         }
 721         bpf_cgroup_storages_link(new_storage, cgrp, type);
 722         return 0;
 723
 724 cleanup_trampoline:
 725         if (type == BPF_LSM_CGROUP)
 726                 bpf_trampoline_unlink_cgroup_shim(new_prog);
 727
 728 cleanup:
 729         if (old_prog) {
 730                 pl->prog = old_prog;
 731                 pl->link = NULL;
 732         }
 733         bpf_cgroup_storages_free(new_storage);
 734         if (!old_prog) {
 735                 hlist_del(&pl->node);
 736                 kfree(pl);
 737         }
 738         return err;
 739 }
 740
 741 static int cgroup_bpf_attach(struct cgroup *cgrp,
 742                              struct bpf_prog *prog, struct bpf_prog *replace_prog,
 743                              struct bpf_cgroup_link *link,
 744                              enum bpf_attach_type type,
 745                              u32 flags)
 746 {
 747         int ret;
 748
 749         cgroup_lock();
 750         ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
 751         cgroup_unlock();
 752         return ret;
 753 }
 754
 755 /* Swap updated BPF program for given link in effective program arrays across
 756  * all descendant cgroups. This function is guaranteed to succeed.
 757  */
 758 static void replace_effective_prog(struct cgroup *cgrp,
 759                                    enum cgroup_bpf_attach_type atype,
 760                                    struct bpf_cgroup_link *link)
 761 {
 762         struct bpf_prog_array_item *item;
 763         struct cgroup_subsys_state *css;
 764         struct bpf_prog_array *progs;
 765         struct bpf_prog_list *pl;
 766         struct hlist_head *head;
 767         struct cgroup *cg;
 768         int pos;
 769
 770         css_for_each_descendant_pre(css, &cgrp->self) {
 771                 struct cgroup *desc = container_of(css, struct cgroup, self);
 772
 773                 if (percpu_ref_is_zero(&desc->bpf.refcnt))
 774                         continue;
 775
 776                 /* find position of link in effective progs array */
 777                 for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
 778                         if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 779                                 continue;
 780
 781                         head = &cg->bpf.progs[atype];
 782                         hlist_for_each_entry(pl, head, node) {
 783                                 if (!prog_list_prog(pl))
 784                                         continue;
 785                                 if (pl->link == link)
 786                                         goto found;
 787                                 pos++;
 788                         }
 789                 }
 790 found:
 791                 BUG_ON(!cg);
 792                 progs = rcu_dereference_protected(
 793                                 desc->bpf.effective[atype],
 794                                 lockdep_is_held(&cgroup_mutex));
 795                 item = &progs->items[pos];
 796                 WRITE_ONCE(item->prog, link->link.prog);
 797         }
 798 }
 799
 800 /**
 801  * __cgroup_bpf_replace() - Replace link's program and propagate the change
 802  *                          to descendants
 803  * @cgrp: The cgroup which descendants to traverse
 804  * @link: A link for which to replace BPF program
 805  * @new_prog: &struct bpf_prog for the target BPF program with its refcnt
 806  *            incremented
 807  *
 808  * Must be called with cgroup_mutex held.
 809  */
 810 static int __cgroup_bpf_replace(struct cgroup *cgrp,
 811                                 struct bpf_cgroup_link *link,
 812                                 struct bpf_prog *new_prog)
 813 {
 814         enum cgroup_bpf_attach_type atype;
 815         struct bpf_prog *old_prog;
 816         struct bpf_prog_list *pl;
 817         struct hlist_head *progs;
 818         bool found = false;
 819
 820         atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
 821         if (atype < 0)
 822                 return -EINVAL;
 823
 824         progs = &cgrp->bpf.progs[atype];
 825
 826         if (link->link.prog->type != new_prog->type)
 827                 return -EINVAL;
 828
 829         hlist_for_each_entry(pl, progs, node) {
 830                 if (pl->link == link) {
 831                         found = true;
 832                         break;
 833                 }
 834         }
 835         if (!found)
 836                 return -ENOENT;
 837
 838         old_prog = xchg(&link->link.prog, new_prog);
 839         replace_effective_prog(cgrp, atype, link);
 840         bpf_prog_put(old_prog);
 841         return 0;
 842 }
 843
 844 static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
 845                               struct bpf_prog *old_prog)
 846 {
 847         struct bpf_cgroup_link *cg_link;
 848         int ret;
 849
 850         cg_link = container_of(link, struct bpf_cgroup_link, link);
 851
 852         cgroup_lock();
 853         /* link might have been auto-released by dying cgroup, so fail */
 854         if (!cg_link->cgroup) {
 855                 ret = -ENOLINK;
 856                 goto out_unlock;
 857         }
 858         if (old_prog && link->prog != old_prog) {
 859                 ret = -EPERM;
 860                 goto out_unlock;
 861         }
 862         ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
 863 out_unlock:
 864         cgroup_unlock();
 865         return ret;
 866 }
 867
 868 static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
 869                                                struct bpf_prog *prog,
 870                                                struct bpf_cgroup_link *link,
 871                                                bool allow_multi)
 872 {
 873         struct bpf_prog_list *pl;
 874
 875         if (!allow_multi) {
 876                 if (hlist_empty(progs))
 877                         /* report error when trying to detach and nothing is attached */
 878                         return ERR_PTR(-ENOENT);
 879
 880                 /* to maintain backward compatibility NONE and OVERRIDE cgroups
 881                  * allow detaching with invalid FD (prog==NULL) in legacy mode
 882                  */
 883                 return hlist_entry(progs->first, typeof(*pl), node);
 884         }
 885
 886         if (!prog && !link)
 887                 /* to detach MULTI prog the user has to specify valid FD
 888                  * of the program or link to be detached
 889                  */
 890                 return ERR_PTR(-EINVAL);
 891
 892         /* find the prog or link and detach it */
 893         hlist_for_each_entry(pl, progs, node) {
 894                 if (pl->prog == prog && pl->link == link)
 895                         return pl;
 896         }
 897         return ERR_PTR(-ENOENT);
 898 }
 899
 900 /**
 901  * purge_effective_progs() - After compute_effective_progs fails to alloc new
 902  *                           cgrp->bpf.inactive table we can recover by
 903  *                           recomputing the array in place.
 904  *
 905  * @cgrp: The cgroup which descendants to travers
 906  * @prog: A program to detach or NULL
 907  * @link: A link to detach or NULL
 908  * @atype: Type of detach operation
 909  */
 910 static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
 911                                   struct bpf_cgroup_link *link,
 912                                   enum cgroup_bpf_attach_type atype)
 913 {
 914         struct cgroup_subsys_state *css;
 915         struct bpf_prog_array *progs;
 916         struct bpf_prog_list *pl;
 917         struct hlist_head *head;
 918         struct cgroup *cg;
 919         int pos;
 920
 921         /* recompute effective prog array in place */
 922         css_for_each_descendant_pre(css, &cgrp->self) {
 923                 struct cgroup *desc = container_of(css, struct cgroup, self);
 924
 925                 if (percpu_ref_is_zero(&desc->bpf.refcnt))
 926                         continue;
 927
 928                 /* find position of link or prog in effective progs array */
 929                 for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
 930                         if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
 931                                 continue;
 932
 933                         head = &cg->bpf.progs[atype];
 934                         hlist_for_each_entry(pl, head, node) {
 935                                 if (!prog_list_prog(pl))
 936                                         continue;
 937                                 if (pl->prog == prog && pl->link == link)
 938                                         goto found;
 939                                 pos++;
 940                         }
 941                 }
 942
 943                 /* no link or prog match, skip the cgroup of this layer */
 944                 continue;
 945 found:
 946                 progs = rcu_dereference_protected(
 947                                 desc->bpf.effective[atype],
 948                                 lockdep_is_held(&cgroup_mutex));
 949
 950                 /* Remove the program from the array */
 951                 WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
 952                           "Failed to purge a prog from array at index %d", pos);
 953         }
 954 }
 955
 956 /**
 957  * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
 958  *                         propagate the change to descendants
 959  * @cgrp: The cgroup which descendants to traverse
 960  * @prog: A program to detach or NULL
 961  * @link: A link to detach or NULL
 962  * @type: Type of detach operation
 963  *
 964  * At most one of @prog or @link can be non-NULL.
 965  * Must be called with cgroup_mutex held.
 966  */
 967 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 968                                struct bpf_cgroup_link *link, enum bpf_attach_type type)
 969 {
 970         enum cgroup_bpf_attach_type atype;
 971         struct bpf_prog *old_prog;
 972         struct bpf_prog_list *pl;
 973         struct hlist_head *progs;
 974         u32 attach_btf_id = 0;
 975         u32 flags;
 976
 977         if (prog)
 978                 attach_btf_id = prog->aux->attach_btf_id;
 979         if (link)
 980                 attach_btf_id = link->link.prog->aux->attach_btf_id;
 981
 982         atype = bpf_cgroup_atype_find(type, attach_btf_id);
 983         if (atype < 0)
 984                 return -EINVAL;
 985
 986         progs = &cgrp->bpf.progs[atype];
 987         flags = cgrp->bpf.flags[atype];
 988
 989         if (prog && link)
 990                 /* only one of prog or link can be specified */
 991                 return -EINVAL;
 992
 993         pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
 994         if (IS_ERR(pl))
 995                 return PTR_ERR(pl);
 996
 997         /* mark it deleted, so it's ignored while recomputing effective */
 998         old_prog = pl->prog;
 999         pl->prog = NULL;
1000         pl->link = NULL;
1001
1002         if (update_effective_progs(cgrp, atype)) {
1003                 /* if update effective array failed replace the prog with a dummy prog*/
1004                 pl->prog = old_prog;
1005                 pl->link = link;
1006                 purge_effective_progs(cgrp, old_prog, link, atype);
1007         }
1008
1009         /* now can actually delete it from this cgroup list */
1010         hlist_del(&pl->node);
1011
1012         kfree(pl);
1013         if (hlist_empty(progs))
1014                 /* last program was detached, reset flags to zero */
1015                 cgrp->bpf.flags[atype] = 0;
1016         if (old_prog) {
1017                 if (type == BPF_LSM_CGROUP)
1018                         bpf_trampoline_unlink_cgroup_shim(old_prog);
1019                 bpf_prog_put(old_prog);
1020         }
1021         static_branch_dec(&cgroup_bpf_enabled_key[atype]);
1022         return 0;
1023 }
1024
1025 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
1026                              enum bpf_attach_type type)
1027 {
1028         int ret;
1029
1030         cgroup_lock();
1031         ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
1032         cgroup_unlock();
1033         return ret;
1034 }
1035
1036 /* Must be called with cgroup_mutex held to avoid races. */
1037 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1038                               union bpf_attr __user *uattr)
1039 {
1040         __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
1041         bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
1042         __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
1043         enum bpf_attach_type type = attr->query.attach_type;
1044         enum cgroup_bpf_attach_type from_atype, to_atype;
1045         enum cgroup_bpf_attach_type atype;
1046         struct bpf_prog_array *effective;
1047         int cnt, ret = 0, i;
1048         int total_cnt = 0;
1049         u32 flags;
1050
1051         if (effective_query && prog_attach_flags)
1052                 return -EINVAL;
1053
1054         if (type == BPF_LSM_CGROUP) {
1055                 if (!effective_query && attr->query.prog_cnt &&
1056                     prog_ids && !prog_attach_flags)
1057                         return -EINVAL;
1058
1059                 from_atype = CGROUP_LSM_START;
1060                 to_atype = CGROUP_LSM_END;
1061                 flags = 0;
1062         } else {
1063                 from_atype = to_cgroup_bpf_attach_type(type);
1064                 if (from_atype < 0)
1065                         return -EINVAL;
1066                 to_atype = from_atype;
1067                 flags = cgrp->bpf.flags[from_atype];
1068         }
1069
1070         for (atype = from_atype; atype <= to_atype; atype++) {
1071                 if (effective_query) {
1072                         effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1073                                                               lockdep_is_held(&cgroup_mutex));
1074                         total_cnt += bpf_prog_array_length(effective);
1075                 } else {
1076                         total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
1077                 }
1078         }
1079
1080         /* always output uattr->query.attach_flags as 0 during effective query */
1081         flags = effective_query ? 0 : flags;
1082         if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
1083                 return -EFAULT;
1084         if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
1085                 return -EFAULT;
1086         if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
1087                 /* return early if user requested only program count + flags */
1088                 return 0;
1089
1090         if (attr->query.prog_cnt < total_cnt) {
1091                 total_cnt = attr->query.prog_cnt;
1092                 ret = -ENOSPC;
1093         }
1094
1095         for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
1096                 if (effective_query) {
1097                         effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1098                                                               lockdep_is_held(&cgroup_mutex));
1099                         cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
1100                         ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
1101                 } else {
1102                         struct hlist_head *progs;
1103                         struct bpf_prog_list *pl;
1104                         struct bpf_prog *prog;
1105                         u32 id;
1106
1107                         progs = &cgrp->bpf.progs[atype];
1108                         cnt = min_t(int, prog_list_length(progs), total_cnt);
1109                         i = 0;
1110                         hlist_for_each_entry(pl, progs, node) {
1111                                 prog = prog_list_prog(pl);
1112                                 id = prog->aux->id;
1113                                 if (copy_to_user(prog_ids + i, &id, sizeof(id)))
1114                                         return -EFAULT;
1115                                 if (++i == cnt)
1116                                         break;
1117                         }
1118
1119                         if (prog_attach_flags) {
1120                                 flags = cgrp->bpf.flags[atype];
1121
1122                                 for (i = 0; i < cnt; i++)
1123                                         if (copy_to_user(prog_attach_flags + i,
1124                                                          &flags, sizeof(flags)))
1125                                                 return -EFAULT;
1126                                 prog_attach_flags += cnt;
1127                         }
1128                 }
1129
1130                 prog_ids += cnt;
1131                 total_cnt -= cnt;
1132         }
1133         return ret;
1134 }
1135
1136 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1137                             union bpf_attr __user *uattr)
1138 {
1139         int ret;
1140
1141         cgroup_lock();
1142         ret = __cgroup_bpf_query(cgrp, attr, uattr);
1143         cgroup_unlock();
1144         return ret;
1145 }
1146
1147 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
1148                            enum bpf_prog_type ptype, struct bpf_prog *prog)
1149 {
1150         struct bpf_prog *replace_prog = NULL;
1151         struct cgroup *cgrp;
1152         int ret;
1153
1154         cgrp = cgroup_get_from_fd(attr->target_fd);
1155         if (IS_ERR(cgrp))
1156                 return PTR_ERR(cgrp);
1157
1158         if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
1159             (attr->attach_flags & BPF_F_REPLACE)) {
1160                 replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
1161                 if (IS_ERR(replace_prog)) {
1162                         cgroup_put(cgrp);
1163                         return PTR_ERR(replace_prog);
1164                 }
1165         }
1166
1167         ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
1168                                 attr->attach_type, attr->attach_flags);
1169
1170         if (replace_prog)
1171                 bpf_prog_put(replace_prog);
1172         cgroup_put(cgrp);
1173         return ret;
1174 }
1175
1176 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
1177 {
1178         struct bpf_prog *prog;
1179         struct cgroup *cgrp;
1180         int ret;
1181
1182         cgrp = cgroup_get_from_fd(attr->target_fd);
1183         if (IS_ERR(cgrp))
1184                 return PTR_ERR(cgrp);
1185
1186         prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
1187         if (IS_ERR(prog))
1188                 prog = NULL;
1189
1190         ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
1191         if (prog)
1192                 bpf_prog_put(prog);
1193
1194         cgroup_put(cgrp);
1195         return ret;
1196 }
1197
1198 static void bpf_cgroup_link_release(struct bpf_link *link)
1199 {
1200         struct bpf_cgroup_link *cg_link =
1201                 container_of(link, struct bpf_cgroup_link, link);
1202         struct cgroup *cg;
1203
1204         /* link might have been auto-detached by dying cgroup already,
1205          * in that case our work is done here
1206          */
1207         if (!cg_link->cgroup)
1208                 return;
1209
1210         cgroup_lock();
1211
1212         /* re-check cgroup under lock again */
1213         if (!cg_link->cgroup) {
1214                 cgroup_unlock();
1215                 return;
1216         }
1217
1218         WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
1219                                     cg_link->type));
1220         if (cg_link->type == BPF_LSM_CGROUP)
1221                 bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
1222
1223         cg = cg_link->cgroup;
1224         cg_link->cgroup = NULL;
1225
1226         cgroup_unlock();
1227
1228         cgroup_put(cg);
1229 }
1230
1231 static void bpf_cgroup_link_dealloc(struct bpf_link *link)
1232 {
1233         struct bpf_cgroup_link *cg_link =
1234                 container_of(link, struct bpf_cgroup_link, link);
1235
1236         kfree(cg_link);
1237 }
1238
/*
 * ->detach callback: performs the same work as ->release and always
 * reports success.
 */
static int bpf_cgroup_link_detach(struct bpf_link *link)
{
	bpf_cgroup_link_release(link);
	return 0;
}
1245
1246 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
1247                                         struct seq_file *seq)
1248 {
1249         struct bpf_cgroup_link *cg_link =
1250                 container_of(link, struct bpf_cgroup_link, link);
1251         u64 cg_id = 0;
1252
1253         cgroup_lock();
1254         if (cg_link->cgroup)
1255                 cg_id = cgroup_id(cg_link->cgroup);
1256         cgroup_unlock();
1257
1258         seq_printf(seq,
1259                    "cgroup_id:\t%llu\n"
1260                    "attach_type:\t%d\n",
1261                    cg_id,
1262                    cg_link->type);
1263 }
1264
1265 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1266                                           struct bpf_link_info *info)
1267 {
1268         struct bpf_cgroup_link *cg_link =
1269                 container_of(link, struct bpf_cgroup_link, link);
1270         u64 cg_id = 0;
1271
1272         cgroup_lock();
1273         if (cg_link->cgroup)
1274                 cg_id = cgroup_id(cg_link->cgroup);
1275         cgroup_unlock();
1276
1277         info->cgroup.cgroup_id = cg_id;
1278         info->cgroup.attach_type = cg_link->type;
1279         return 0;
1280 }
1281
1282 static const struct bpf_link_ops bpf_cgroup_link_lops = {
1283         .release = bpf_cgroup_link_release,
1284         .dealloc = bpf_cgroup_link_dealloc,
1285         .detach = bpf_cgroup_link_detach,
1286         .update_prog = cgroup_bpf_replace,
1287         .show_fdinfo = bpf_cgroup_link_show_fdinfo,
1288         .fill_link_info = bpf_cgroup_link_fill_link_info,
1289 };
1290
1291 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1292 {
1293         struct bpf_link_primer link_primer;
1294         struct bpf_cgroup_link *link;
1295         struct cgroup *cgrp;
1296         int err;
1297
1298         if (attr->link_create.flags)
1299                 return -EINVAL;
1300
1301         cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1302         if (IS_ERR(cgrp))
1303                 return PTR_ERR(cgrp);
1304
1305         link = kzalloc(sizeof(*link), GFP_USER);
1306         if (!link) {
1307                 err = -ENOMEM;
1308                 goto out_put_cgroup;
1309         }
1310         bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1311                       prog);
1312         link->cgroup = cgrp;
1313         link->type = attr->link_create.attach_type;
1314
1315         err = bpf_link_prime(&link->link, &link_primer);
1316         if (err) {
1317                 kfree(link);
1318                 goto out_put_cgroup;
1319         }
1320
1321         err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1322                                 link->type, BPF_F_ALLOW_MULTI);
1323         if (err) {
1324                 bpf_link_cleanup(&link_primer);
1325                 goto out_put_cgroup;
1326         }
1327
1328         return bpf_link_settle(&link_primer);
1329
1330 out_put_cgroup:
1331         cgroup_put(cgrp);
1332         return err;
1333 }
1334
1335 int cgroup_bpf_prog_query(const union bpf_attr *attr,
1336                           union bpf_attr __user *uattr)
1337 {
1338         struct cgroup *cgrp;
1339         int ret;
1340
1341         cgrp = cgroup_get_from_fd(attr->query.target_fd);
1342         if (IS_ERR(cgrp))
1343                 return PTR_ERR(cgrp);
1344
1345         ret = cgroup_bpf_query(cgrp, attr, uattr);
1346
1347         cgroup_put(cgrp);
1348         return ret;
1349 }
1350
1351 /**
1352  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1353  * @sk: The socket sending or receiving traffic
1354  * @skb: The skb that is being sent or received
1355  * @atype: The type of program to be executed
1356  *
1357  * If no socket is passed, or the socket is not of type INET or INET6,
1358  * this function does nothing and returns 0.
1359  *
1360  * The program type passed in via @type must be suitable for network
1361  * filtering. No further check is performed to assert that.
1362  *
1363  * For egress packets, this function can return:
1364  *   NET_XMIT_SUCCESS    (0)    - continue with packet output
1365  *   NET_XMIT_DROP       (1)    - drop packet and notify TCP to call cwr
1366  *   NET_XMIT_CN         (2)    - continue with packet output and notify TCP
1367  *                                to call cwr
1368  *   -err                       - drop packet
1369  *
1370  * For ingress packets, this function will return -EPERM if any
1371  * attached program was found and if it returned != 1 during execution.
1372  * Otherwise 0 is returned.
1373  */
1374 int __cgroup_bpf_run_filter_skb(struct sock *sk,
1375                                 struct sk_buff *skb,
1376                                 enum cgroup_bpf_attach_type atype)
1377 {
1378         unsigned int offset = -skb_network_offset(skb);
1379         struct sock *save_sk;
1380         void *saved_data_end;
1381         struct cgroup *cgrp;
1382         int ret;
1383
1384         if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1385                 return 0;
1386
1387         cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1388         save_sk = skb->sk;
1389         skb->sk = sk;
1390         __skb_push(skb, offset);
1391
1392         /* compute pointers for the bpf prog */
1393         bpf_compute_and_save_data_end(skb, &saved_data_end);
1394
1395         if (atype == CGROUP_INET_EGRESS) {
1396                 u32 flags = 0;
1397                 bool cn;
1398
1399                 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
1400                                             __bpf_prog_run_save_cb, 0, &flags);
1401
1402                 /* Return values of CGROUP EGRESS BPF programs are:
1403                  *   0: drop packet
1404                  *   1: keep packet
1405                  *   2: drop packet and cn
1406                  *   3: keep packet and cn
1407                  *
1408                  * The returned value is then converted to one of the NET_XMIT
1409                  * or an error code that is then interpreted as drop packet
1410                  * (and no cn):
1411                  *   0: NET_XMIT_SUCCESS  skb should be transmitted
1412                  *   1: NET_XMIT_DROP     skb should be dropped and cn
1413                  *   2: NET_XMIT_CN       skb should be transmitted and cn
1414                  *   3: -err              skb should be dropped
1415                  */
1416
1417                 cn = flags & BPF_RET_SET_CN;
1418                 if (ret && !IS_ERR_VALUE((long)ret))
1419                         ret = -EFAULT;
1420                 if (!ret)
1421                         ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1422                 else
1423                         ret = (cn ? NET_XMIT_DROP : ret);
1424         } else {
1425                 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
1426                                             skb, __bpf_prog_run_save_cb, 0,
1427                                             NULL);
1428                 if (ret && !IS_ERR_VALUE((long)ret))
1429                         ret = -EFAULT;
1430         }
1431         bpf_restore_data_end(skb, saved_data_end);
1432         __skb_pull(skb, offset);
1433         skb->sk = save_sk;
1434
1435         return ret;
1436 }
1437 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
1438
1439 /**
1440  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
1441  * @sk: sock structure to manipulate
1442  * @atype: The type of program to be executed
1443  *
1444  * socket is passed is expected to be of type INET or INET6.
1445  *
1446  * The program type passed in via @type must be suitable for sock
1447  * filtering. No further check is performed to assert that.
1448  *
1449  * This function will return %-EPERM if any if an attached program was found
1450  * and if it returned != 1 during execution. In all other cases, 0 is returned.
1451  */
1452 int __cgroup_bpf_run_filter_sk(struct sock *sk,
1453                                enum cgroup_bpf_attach_type atype)
1454 {
1455         struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1456
1457         return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
1458                                      NULL);
1459 }
1460 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1461
1462 /**
1463  * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1464  *                                       provided by user sockaddr
1465  * @sk: sock struct that will use sockaddr
1466  * @uaddr: sockaddr struct provided by user
1467  * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
1468  *            read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
1469  *            uaddr.
1470  * @atype: The type of program to be executed
1471  * @t_ctx: Pointer to attach type specific context
1472  * @flags: Pointer to u32 which contains higher bits of BPF program
1473  *         return value (OR'ed together).
1474  *
1475  * socket is expected to be of type INET, INET6 or UNIX.
1476  *
1477  * This function will return %-EPERM if an attached program is found and
1478  * returned value != 1 during execution. In all other cases, 0 is returned.
1479  */
1480 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1481                                       struct sockaddr *uaddr,
1482                                       int *uaddrlen,
1483                                       enum cgroup_bpf_attach_type atype,
1484                                       void *t_ctx,
1485                                       u32 *flags)
1486 {
1487         struct bpf_sock_addr_kern ctx = {
1488                 .sk = sk,
1489                 .uaddr = uaddr,
1490                 .t_ctx = t_ctx,
1491         };
1492         struct sockaddr_storage unspec;
1493         struct cgroup *cgrp;
1494         int ret;
1495
1496         /* Check socket family since not all sockets represent network
1497          * endpoint (e.g. AF_UNIX).
1498          */
1499         if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
1500             sk->sk_family != AF_UNIX)
1501                 return 0;
1502
1503         if (!ctx.uaddr) {
1504                 memset(&unspec, 0, sizeof(unspec));
1505                 ctx.uaddr = (struct sockaddr *)&unspec;
1506                 ctx.uaddrlen = 0;
1507         } else {
1508                 ctx.uaddrlen = *uaddrlen;
1509         }
1510
1511         cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1512         ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
1513                                     0, flags);
1514
1515         if (!ret && uaddr)
1516                 *uaddrlen = ctx.uaddrlen;
1517
1518         return ret;
1519 }
1520 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1521
1522 /**
1523  * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1524  * @sk: socket to get cgroup from
1525  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1526  * sk with connection information (IP addresses, etc.) May not contain
1527  * cgroup info if it is a req sock.
1528  * @atype: The type of program to be executed
1529  *
1530  * socket passed is expected to be of type INET or INET6.
1531  *
1532  * The program type passed in via @type must be suitable for sock_ops
1533  * filtering. No further check is performed to assert that.
1534  *
1535  * This function will return %-EPERM if any if an attached program was found
1536  * and if it returned != 1 during execution. In all other cases, 0 is returned.
1537  */
1538 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1539                                      struct bpf_sock_ops_kern *sock_ops,
1540                                      enum cgroup_bpf_attach_type atype)
1541 {
1542         struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1543
1544         return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1545                                      0, NULL);
1546 }
1547 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1548
1549 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1550                                       short access, enum cgroup_bpf_attach_type atype)
1551 {
1552         struct cgroup *cgrp;
1553         struct bpf_cgroup_dev_ctx ctx = {
1554                 .access_type = (access << 16) | dev_type,
1555                 .major = major,
1556                 .minor = minor,
1557         };
1558         int ret;
1559
1560         rcu_read_lock();
1561         cgrp = task_dfl_cgroup(current);
1562         ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1563                                     NULL);
1564         rcu_read_unlock();
1565
1566         return ret;
1567 }
1568
1569 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
1570 {
1571         /* flags argument is not used now,
1572          * but provides an ability to extend the API.
1573          * verifier checks that its value is correct.
1574          */
1575         enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
1576         struct bpf_cgroup_storage *storage;
1577         struct bpf_cg_run_ctx *ctx;
1578         void *ptr;
1579
1580         /* get current cgroup storage from BPF run context */
1581         ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1582         storage = ctx->prog_item->cgroup_storage[stype];
1583
1584         if (stype == BPF_CGROUP_STORAGE_SHARED)
1585                 ptr = &READ_ONCE(storage->buf)->data[0];
1586         else
1587                 ptr = this_cpu_ptr(storage->percpu_buf);
1588
1589         return (unsigned long)ptr;
1590 }
1591
1592 const struct bpf_func_proto bpf_get_local_storage_proto = {
1593         .func           = bpf_get_local_storage,
1594         .gpl_only       = false,
1595         .ret_type       = RET_PTR_TO_MAP_VALUE,
1596         .arg1_type      = ARG_CONST_MAP_PTR,
1597         .arg2_type      = ARG_ANYTHING,
1598 };
1599
1600 BPF_CALL_0(bpf_get_retval)
1601 {
1602         struct bpf_cg_run_ctx *ctx =
1603                 container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1604
1605         return ctx->retval;
1606 }
1607
1608 const struct bpf_func_proto bpf_get_retval_proto = {
1609         .func           = bpf_get_retval,
1610         .gpl_only       = false,
1611         .ret_type       = RET_INTEGER,
1612 };
1613
1614 BPF_CALL_1(bpf_set_retval, int, retval)
1615 {
1616         struct bpf_cg_run_ctx *ctx =
1617                 container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1618
1619         ctx->retval = retval;
1620         return 0;
1621 }
1622
1623 const struct bpf_func_proto bpf_set_retval_proto = {
1624         .func           = bpf_set_retval,
1625         .gpl_only       = false,
1626         .ret_type       = RET_INTEGER,
1627         .arg1_type      = ARG_ANYTHING,
1628 };
1629
1630 static const struct bpf_func_proto *
1631 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1632 {
1633         const struct bpf_func_proto *func_proto;
1634
1635         func_proto = cgroup_common_func_proto(func_id, prog);
1636         if (func_proto)
1637                 return func_proto;
1638
1639         func_proto = cgroup_current_func_proto(func_id, prog);
1640         if (func_proto)
1641                 return func_proto;
1642
1643         switch (func_id) {
1644         case BPF_FUNC_perf_event_output:
1645                 return &bpf_event_output_data_proto;
1646         default:
1647                 return bpf_base_func_proto(func_id, prog);
1648         }
1649 }
1650
1651 static bool cgroup_dev_is_valid_access(int off, int size,
1652                                        enum bpf_access_type type,
1653                                        const struct bpf_prog *prog,
1654                                        struct bpf_insn_access_aux *info)
1655 {
1656         const int size_default = sizeof(__u32);
1657
1658         if (type == BPF_WRITE)
1659                 return false;
1660
1661         if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1662                 return false;
1663         /* The verifier guarantees that size > 0. */
1664         if (off % size != 0)
1665                 return false;
1666
1667         switch (off) {
1668         case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1669                 bpf_ctx_record_field_size(info, size_default);
1670                 if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1671                         return false;
1672                 break;
1673         default:
1674                 if (size != size_default)
1675                         return false;
1676         }
1677
1678         return true;
1679 }
1680
1681 const struct bpf_prog_ops cg_dev_prog_ops = {
1682 };
1683
1684 const struct bpf_verifier_ops cg_dev_verifier_ops = {
1685         .get_func_proto         = cgroup_dev_func_proto,
1686         .is_valid_access        = cgroup_dev_is_valid_access,
1687 };
1688
1689 /**
1690  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
1691  *
1692  * @head: sysctl table header
1693  * @table: sysctl table
1694  * @write: sysctl is being read (= 0) or written (= 1)
1695  * @buf: pointer to buffer (in and out)
1696  * @pcount: value-result argument: value is size of buffer pointed to by @buf,
1697  *      result is size of @new_buf if program set new value, initial value
1698  *      otherwise
1699  * @ppos: value-result argument: value is position at which read from or write
1700  *      to sysctl is happening, result is new position if program overrode it,
1701  *      initial value otherwise
1702  * @atype: type of program to be executed
1703  *
1704  * Program is run when sysctl is being accessed, either read or written, and
1705  * can allow or deny such access.
1706  *
1707  * This function will return %-EPERM if an attached program is found and
1708  * returned value != 1 during execution. In all other cases 0 is returned.
1709  */
1710 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
1711                                    const struct ctl_table *table, int write,
1712                                    char **buf, size_t *pcount, loff_t *ppos,
1713                                    enum cgroup_bpf_attach_type atype)
1714 {
1715         struct bpf_sysctl_kern ctx = {
1716                 .head = head,
1717                 .table = table,
1718                 .write = write,
1719                 .ppos = ppos,
1720                 .cur_val = NULL,
1721                 .cur_len = PAGE_SIZE,
1722                 .new_val = NULL,
1723                 .new_len = 0,
1724                 .new_updated = 0,
1725         };
1726         struct cgroup *cgrp;
1727         loff_t pos = 0;
1728         int ret;
1729
1730         ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
1731         if (!ctx.cur_val ||
1732             table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
1733                 /* Let BPF program decide how to proceed. */
1734                 ctx.cur_len = 0;
1735         }
1736
1737         if (write && *buf && *pcount) {
1738                 /* BPF program should be able to override new value with a
1739                  * buffer bigger than provided by user.
1740                  */
1741                 ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
1742                 ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
1743                 if (ctx.new_val) {
1744                         memcpy(ctx.new_val, *buf, ctx.new_len);
1745                 } else {
1746                         /* Let BPF program decide how to proceed. */
1747                         ctx.new_len = 0;
1748                 }
1749         }
1750
1751         rcu_read_lock();
1752         cgrp = task_dfl_cgroup(current);
1753         ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1754                                     NULL);
1755         rcu_read_unlock();
1756
1757         kfree(ctx.cur_val);
1758
1759         if (ret == 1 && ctx.new_updated) {
1760                 kfree(*buf);
1761                 *buf = ctx.new_val;
1762                 *pcount = ctx.new_len;
1763         } else {
1764                 kfree(ctx.new_val);
1765         }
1766
1767         return ret;
1768 }
1769
1770 #ifdef CONFIG_NET
1771 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
1772                              struct bpf_sockopt_buf *buf)
1773 {
1774         if (unlikely(max_optlen < 0))
1775                 return -EINVAL;
1776
1777         if (unlikely(max_optlen > PAGE_SIZE)) {
1778                 /* We don't expose optvals that are greater than PAGE_SIZE
1779                  * to the BPF program.
1780                  */
1781                 max_optlen = PAGE_SIZE;
1782         }
1783
1784         if (max_optlen <= sizeof(buf->data)) {
1785                 /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
1786                  * bytes avoid the cost of kzalloc.
1787                  */
1788                 ctx->optval = buf->data;
1789                 ctx->optval_end = ctx->optval + max_optlen;
1790                 return max_optlen;
1791         }
1792
1793         ctx->optval = kzalloc(max_optlen, GFP_USER);
1794         if (!ctx->optval)
1795                 return -ENOMEM;
1796
1797         ctx->optval_end = ctx->optval + max_optlen;
1798
1799         return max_optlen;
1800 }
1801
1802 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
1803                              struct bpf_sockopt_buf *buf)
1804 {
1805         if (ctx->optval == buf->data)
1806                 return;
1807         kfree(ctx->optval);
1808 }
1809
1810 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
1811                                   struct bpf_sockopt_buf *buf)
1812 {
1813         return ctx->optval != buf->data;
1814 }
1815
1816 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
1817                                        int *optname, sockptr_t optval,
1818                                        int *optlen, char **kernel_optval)
1819 {
1820         struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1821         struct bpf_sockopt_buf buf = {};
1822         struct bpf_sockopt_kern ctx = {
1823                 .sk = sk,
1824                 .level = *level,
1825                 .optname = *optname,
1826         };
1827         int ret, max_optlen;
1828
1829         /* Allocate a bit more than the initial user buffer for
1830          * BPF program. The canonical use case is overriding
1831          * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
1832          */
1833         max_optlen = max_t(int, 16, *optlen);
1834         max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1835         if (max_optlen < 0)
1836                 return max_optlen;
1837
1838         ctx.optlen = *optlen;
1839
1840         if (copy_from_sockptr(ctx.optval, optval,
1841                               min(*optlen, max_optlen))) {
1842                 ret = -EFAULT;
1843                 goto out;
1844         }
1845
1846         lock_sock(sk);
1847         ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
1848                                     &ctx, bpf_prog_run, 0, NULL);
1849         release_sock(sk);
1850
1851         if (ret)
1852                 goto out;
1853
1854         if (ctx.optlen == -1) {
1855                 /* optlen set to -1, bypass kernel */
1856                 ret = 1;
1857         } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1858                 /* optlen is out of bounds */
1859                 if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
1860                         pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
1861                                      ctx.optlen, max_optlen);
1862                         ret = 0;
1863                         goto out;
1864                 }
1865                 ret = -EFAULT;
1866         } else {
1867                 /* optlen within bounds, run kernel handler */
1868                 ret = 0;
1869
1870                 /* export any potential modifications */
1871                 *level = ctx.level;
1872                 *optname = ctx.optname;
1873
1874                 /* optlen == 0 from BPF indicates that we should
1875                  * use original userspace data.
1876                  */
1877                 if (ctx.optlen != 0) {
1878                         *optlen = ctx.optlen;
1879                         /* We've used bpf_sockopt_kern->buf as an intermediary
1880                          * storage, but the BPF program indicates that we need
1881                          * to pass this data to the kernel setsockopt handler.
1882                          * No way to export on-stack buf, have to allocate a
1883                          * new buffer.
1884                          */
1885                         if (!sockopt_buf_allocated(&ctx, &buf)) {
1886                                 void *p = kmalloc(ctx.optlen, GFP_USER);
1887
1888                                 if (!p) {
1889                                         ret = -ENOMEM;
1890                                         goto out;
1891                                 }
1892                                 memcpy(p, ctx.optval, ctx.optlen);
1893                                 *kernel_optval = p;
1894                         } else {
1895                                 *kernel_optval = ctx.optval;
1896                         }
1897                         /* export and don't free sockopt buf */
1898                         return 0;
1899                 }
1900         }
1901
1902 out:
1903         sockopt_free_buf(&ctx, &buf);
1904         return ret;
1905 }
1906
1907 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1908                                        int optname, sockptr_t optval,
1909                                        sockptr_t optlen, int max_optlen,
1910                                        int retval)
1911 {
1912         struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1913         struct bpf_sockopt_buf buf = {};
1914         struct bpf_sockopt_kern ctx = {
1915                 .sk = sk,
1916                 .level = level,
1917                 .optname = optname,
1918                 .current_task = current,
1919         };
1920         int orig_optlen;
1921         int ret;
1922
1923         orig_optlen = max_optlen;
1924         ctx.optlen = max_optlen;
1925         max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1926         if (max_optlen < 0)
1927                 return max_optlen;
1928
1929         if (!retval) {
1930                 /* If kernel getsockopt finished successfully,
1931                  * copy whatever was returned to the user back
1932                  * into our temporary buffer. Set optlen to the
1933                  * one that kernel returned as well to let
1934                  * BPF programs inspect the value.
1935                  */
1936                 if (copy_from_sockptr(&ctx.optlen, optlen,
1937                                       sizeof(ctx.optlen))) {
1938                         ret = -EFAULT;
1939                         goto out;
1940                 }
1941
1942                 if (ctx.optlen < 0) {
1943                         ret = -EFAULT;
1944                         goto out;
1945                 }
1946                 orig_optlen = ctx.optlen;
1947
1948                 if (copy_from_sockptr(ctx.optval, optval,
1949                                       min(ctx.optlen, max_optlen))) {
1950                         ret = -EFAULT;
1951                         goto out;
1952                 }
1953         }
1954
1955         lock_sock(sk);
1956         ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1957                                     &ctx, bpf_prog_run, retval, NULL);
1958         release_sock(sk);
1959
1960         if (ret < 0)
1961                 goto out;
1962
1963         if (!sockptr_is_null(optval) &&
1964             (ctx.optlen > max_optlen || ctx.optlen < 0)) {
1965                 if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
1966                         pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
1967                                      ctx.optlen, max_optlen);
1968                         ret = retval;
1969                         goto out;
1970                 }
1971                 ret = -EFAULT;
1972                 goto out;
1973         }
1974
1975         if (ctx.optlen != 0) {
1976                 if (!sockptr_is_null(optval) &&
1977                     copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
1978                         ret = -EFAULT;
1979                         goto out;
1980                 }
1981                 if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
1982                         ret = -EFAULT;
1983                         goto out;
1984                 }
1985         }
1986
1987 out:
1988         sockopt_free_buf(&ctx, &buf);
1989         return ret;
1990 }
1991
1992 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
1993                                             int optname, void *optval,
1994                                             int *optlen, int retval)
1995 {
1996         struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1997         struct bpf_sockopt_kern ctx = {
1998                 .sk = sk,
1999                 .level = level,
2000                 .optname = optname,
2001                 .optlen = *optlen,
2002                 .optval = optval,
2003                 .optval_end = optval + *optlen,
2004                 .current_task = current,
2005         };
2006         int ret;
2007
2008         /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
2009          * user data back into BPF buffer when reval != 0. This is
2010          * done as an optimization to avoid extra copy, assuming
2011          * kernel won't populate the data in case of an error.
2012          * Here we always pass the data and memset() should
2013          * be called if that data shouldn't be "exported".
2014          */
2015
2016         ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
2017                                     &ctx, bpf_prog_run, retval, NULL);
2018         if (ret < 0)
2019                 return ret;
2020
2021         if (ctx.optlen > *optlen)
2022                 return -EFAULT;
2023
2024         /* BPF programs can shrink the buffer, export the modifications.
2025          */
2026         if (ctx.optlen != 0)
2027                 *optlen = ctx.optlen;
2028
2029         return ret;
2030 }
2031 #endif
2032
2033 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
2034                               size_t *lenp)
2035 {
2036         ssize_t tmp_ret = 0, ret;
2037
2038         if (dir->header.parent) {
2039                 tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
2040                 if (tmp_ret < 0)
2041                         return tmp_ret;
2042         }
2043
2044         ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
2045         if (ret < 0)
2046                 return ret;
2047         *bufp += ret;
2048         *lenp -= ret;
2049         ret += tmp_ret;
2050
2051         /* Avoid leading slash. */
2052         if (!ret)
2053                 return ret;
2054
2055         tmp_ret = strscpy(*bufp, "/", *lenp);
2056         if (tmp_ret < 0)
2057                 return tmp_ret;
2058         *bufp += tmp_ret;
2059         *lenp -= tmp_ret;
2060
2061         return ret + tmp_ret;
2062 }
2063
2064 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
2065            size_t, buf_len, u64, flags)
2066 {
2067         ssize_t tmp_ret = 0, ret;
2068
2069         if (!buf)
2070                 return -EINVAL;
2071
2072         if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
2073                 if (!ctx->head)
2074                         return -EINVAL;
2075                 tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
2076                 if (tmp_ret < 0)
2077                         return tmp_ret;
2078         }
2079
2080         ret = strscpy(buf, ctx->table->procname, buf_len);
2081
2082         return ret < 0 ? ret : tmp_ret + ret;
2083 }
2084
2085 static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
2086         .func           = bpf_sysctl_get_name,
2087         .gpl_only       = false,
2088         .ret_type       = RET_INTEGER,
2089         .arg1_type      = ARG_PTR_TO_CTX,
2090         .arg2_type      = ARG_PTR_TO_MEM,
2091         .arg3_type      = ARG_CONST_SIZE,
2092         .arg4_type      = ARG_ANYTHING,
2093 };
2094
/* Copy a sysctl value of @src_len bytes into @dst (capacity @dst_len).
 *
 * - NULL @dst: -EINVAL, nothing written.
 * - zero @dst_len: -E2BIG, nothing written.
 * - NULL @src or zero @src_len: @dst is zeroed, -EINVAL.
 * - value fits with room to spare: copied, the rest of @dst is zero-filled,
 *   returns @src_len.
 * - otherwise: truncated copy whose last byte is forced to '\0', -E2BIG.
 */
static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
                             size_t src_len)
{
        if (!dst)
                return -EINVAL;
        if (!dst_len)
                return -E2BIG;

        if (!src || !src_len) {
                memset(dst, 0, dst_len);
                return -EINVAL;
        }

        if (src_len >= dst_len) {
                /* No room for value plus terminator: truncate. */
                memcpy(dst, src, dst_len);
                dst[dst_len - 1] = '\0';
                return -E2BIG;
        }

        memcpy(dst, src, src_len);
        memset(dst + src_len, '\0', dst_len - src_len);

        return src_len;
}
2120
2121 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
2122            char *, buf, size_t, buf_len)
2123 {
2124         return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
2125 }
2126
2127 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
2128         .func           = bpf_sysctl_get_current_value,
2129         .gpl_only       = false,
2130         .ret_type       = RET_INTEGER,
2131         .arg1_type      = ARG_PTR_TO_CTX,
2132         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
2133         .arg3_type      = ARG_CONST_SIZE,
2134 };
2135
2136 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
2137            size_t, buf_len)
2138 {
2139         if (!ctx->write) {
2140                 if (buf && buf_len)
2141                         memset(buf, '\0', buf_len);
2142                 return -EINVAL;
2143         }
2144         return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
2145 }
2146
2147 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
2148         .func           = bpf_sysctl_get_new_value,
2149         .gpl_only       = false,
2150         .ret_type       = RET_INTEGER,
2151         .arg1_type      = ARG_PTR_TO_CTX,
2152         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
2153         .arg3_type      = ARG_CONST_SIZE,
2154 };
2155
2156 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
2157            const char *, buf, size_t, buf_len)
2158 {
2159         if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
2160                 return -EINVAL;
2161
2162         if (buf_len > PAGE_SIZE - 1)
2163                 return -E2BIG;
2164
2165         memcpy(ctx->new_val, buf, buf_len);
2166         ctx->new_len = buf_len;
2167         ctx->new_updated = 1;
2168
2169         return 0;
2170 }
2171
2172 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
2173         .func           = bpf_sysctl_set_new_value,
2174         .gpl_only       = false,
2175         .ret_type       = RET_INTEGER,
2176         .arg1_type      = ARG_PTR_TO_CTX,
2177         .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
2178         .arg3_type      = ARG_CONST_SIZE,
2179 };
2180
2181 static const struct bpf_func_proto *
2182 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2183 {
2184         const struct bpf_func_proto *func_proto;
2185
2186         func_proto = cgroup_common_func_proto(func_id, prog);
2187         if (func_proto)
2188                 return func_proto;
2189
2190         func_proto = cgroup_current_func_proto(func_id, prog);
2191         if (func_proto)
2192                 return func_proto;
2193
2194         switch (func_id) {
2195         case BPF_FUNC_sysctl_get_name:
2196                 return &bpf_sysctl_get_name_proto;
2197         case BPF_FUNC_sysctl_get_current_value:
2198                 return &bpf_sysctl_get_current_value_proto;
2199         case BPF_FUNC_sysctl_get_new_value:
2200                 return &bpf_sysctl_get_new_value_proto;
2201         case BPF_FUNC_sysctl_set_new_value:
2202                 return &bpf_sysctl_set_new_value_proto;
2203         case BPF_FUNC_ktime_get_coarse_ns:
2204                 return &bpf_ktime_get_coarse_ns_proto;
2205         case BPF_FUNC_perf_event_output:
2206                 return &bpf_event_output_data_proto;
2207         default:
2208                 return bpf_base_func_proto(func_id, prog);
2209         }
2210 }
2211
2212 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
2213                                    const struct bpf_prog *prog,
2214                                    struct bpf_insn_access_aux *info)
2215 {
2216         const int size_default = sizeof(__u32);
2217
2218         if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
2219                 return false;
2220
2221         switch (off) {
2222         case bpf_ctx_range(struct bpf_sysctl, write):
2223                 if (type != BPF_READ)
2224                         return false;
2225                 bpf_ctx_record_field_size(info, size_default);
2226                 return bpf_ctx_narrow_access_ok(off, size, size_default);
2227         case bpf_ctx_range(struct bpf_sysctl, file_pos):
2228                 if (type == BPF_READ) {
2229                         bpf_ctx_record_field_size(info, size_default);
2230                         return bpf_ctx_narrow_access_ok(off, size, size_default);
2231                 } else {
2232                         return size == size_default;
2233                 }
2234         default:
2235                 return false;
2236         }
2237 }
2238
2239 static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
2240                                      const struct bpf_insn *si,
2241                                      struct bpf_insn *insn_buf,
2242                                      struct bpf_prog *prog, u32 *target_size)
2243 {
2244         struct bpf_insn *insn = insn_buf;
2245         u32 read_size;
2246
2247         switch (si->off) {
2248         case offsetof(struct bpf_sysctl, write):
2249                 *insn++ = BPF_LDX_MEM(
2250                         BPF_SIZE(si->code), si->dst_reg, si->src_reg,
2251                         bpf_target_off(struct bpf_sysctl_kern, write,
2252                                        sizeof_field(struct bpf_sysctl_kern,
2253                                                     write),
2254                                        target_size));
2255                 break;
2256         case offsetof(struct bpf_sysctl, file_pos):
2257                 /* ppos is a pointer so it should be accessed via indirect
2258                  * loads and stores. Also for stores additional temporary
2259                  * register is used since neither src_reg nor dst_reg can be
2260                  * overridden.
2261                  */
2262                 if (type == BPF_WRITE) {
2263                         int treg = BPF_REG_9;
2264
2265                         if (si->src_reg == treg || si->dst_reg == treg)
2266                                 --treg;
2267                         if (si->src_reg == treg || si->dst_reg == treg)
2268                                 --treg;
2269                         *insn++ = BPF_STX_MEM(
2270                                 BPF_DW, si->dst_reg, treg,
2271                                 offsetof(struct bpf_sysctl_kern, tmp_reg));
2272                         *insn++ = BPF_LDX_MEM(
2273                                 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2274                                 treg, si->dst_reg,
2275                                 offsetof(struct bpf_sysctl_kern, ppos));
2276                         *insn++ = BPF_RAW_INSN(
2277                                 BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
2278                                 treg, si->src_reg,
2279                                 bpf_ctx_narrow_access_offset(
2280                                         0, sizeof(u32), sizeof(loff_t)),
2281                                 si->imm);
2282                         *insn++ = BPF_LDX_MEM(
2283                                 BPF_DW, treg, si->dst_reg,
2284                                 offsetof(struct bpf_sysctl_kern, tmp_reg));
2285                 } else {
2286                         *insn++ = BPF_LDX_MEM(
2287                                 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2288                                 si->dst_reg, si->src_reg,
2289                                 offsetof(struct bpf_sysctl_kern, ppos));
2290                         read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
2291                         *insn++ = BPF_LDX_MEM(
2292                                 BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
2293                                 bpf_ctx_narrow_access_offset(
2294                                         0, read_size, sizeof(loff_t)));
2295                 }
2296                 *target_size = sizeof(u32);
2297                 break;
2298         }
2299
2300         return insn - insn_buf;
2301 }
2302
2303 const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
2304         .get_func_proto         = sysctl_func_proto,
2305         .is_valid_access        = sysctl_is_valid_access,
2306         .convert_ctx_access     = sysctl_convert_ctx_access,
2307 };
2308
2309 const struct bpf_prog_ops cg_sysctl_prog_ops = {
2310 };
2311
2312 #ifdef CONFIG_NET
2313 BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
2314 {
2315         const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
2316
2317         return net->net_cookie;
2318 }
2319
2320 static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
2321         .func           = bpf_get_netns_cookie_sockopt,
2322         .gpl_only       = false,
2323         .ret_type       = RET_INTEGER,
2324         .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
2325 };
2326 #endif
2327
2328 static const struct bpf_func_proto *
2329 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2330 {
2331         const struct bpf_func_proto *func_proto;
2332
2333         func_proto = cgroup_common_func_proto(func_id, prog);
2334         if (func_proto)
2335                 return func_proto;
2336
2337         func_proto = cgroup_current_func_proto(func_id, prog);
2338         if (func_proto)
2339                 return func_proto;
2340
2341         switch (func_id) {
2342 #ifdef CONFIG_NET
2343         case BPF_FUNC_get_netns_cookie:
2344                 return &bpf_get_netns_cookie_sockopt_proto;
2345         case BPF_FUNC_sk_storage_get:
2346                 return &bpf_sk_storage_get_proto;
2347         case BPF_FUNC_sk_storage_delete:
2348                 return &bpf_sk_storage_delete_proto;
2349         case BPF_FUNC_setsockopt:
2350                 if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2351                         return &bpf_sk_setsockopt_proto;
2352                 return NULL;
2353         case BPF_FUNC_getsockopt:
2354                 if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2355                         return &bpf_sk_getsockopt_proto;
2356                 return NULL;
2357 #endif
2358 #ifdef CONFIG_INET
2359         case BPF_FUNC_tcp_sock:
2360                 return &bpf_tcp_sock_proto;
2361 #endif
2362         case BPF_FUNC_perf_event_output:
2363                 return &bpf_event_output_data_proto;
2364         default:
2365                 return bpf_base_func_proto(func_id, prog);
2366         }
2367 }
2368
2369 static bool cg_sockopt_is_valid_access(int off, int size,
2370                                        enum bpf_access_type type,
2371                                        const struct bpf_prog *prog,
2372                                        struct bpf_insn_access_aux *info)
2373 {
2374         const int size_default = sizeof(__u32);
2375
2376         if (off < 0 || off >= sizeof(struct bpf_sockopt))
2377                 return false;
2378
2379         if (off % size != 0)
2380                 return false;
2381
2382         if (type == BPF_WRITE) {
2383                 switch (off) {
2384                 case offsetof(struct bpf_sockopt, retval):
2385                         if (size != size_default)
2386                                 return false;
2387                         return prog->expected_attach_type ==
2388                                 BPF_CGROUP_GETSOCKOPT;
2389                 case offsetof(struct bpf_sockopt, optname):
2390                         fallthrough;
2391                 case offsetof(struct bpf_sockopt, level):
2392                         if (size != size_default)
2393                                 return false;
2394                         return prog->expected_attach_type ==
2395                                 BPF_CGROUP_SETSOCKOPT;
2396                 case offsetof(struct bpf_sockopt, optlen):
2397                         return size == size_default;
2398                 default:
2399                         return false;
2400                 }
2401         }
2402
2403         switch (off) {
2404         case offsetof(struct bpf_sockopt, sk):
2405                 if (size != sizeof(__u64))
2406                         return false;
2407                 info->reg_type = PTR_TO_SOCKET;
2408                 break;
2409         case offsetof(struct bpf_sockopt, optval):
2410                 if (size != sizeof(__u64))
2411                         return false;
2412                 info->reg_type = PTR_TO_PACKET;
2413                 break;
2414         case offsetof(struct bpf_sockopt, optval_end):
2415                 if (size != sizeof(__u64))
2416                         return false;
2417                 info->reg_type = PTR_TO_PACKET_END;
2418                 break;
2419         case offsetof(struct bpf_sockopt, retval):
2420                 if (size != size_default)
2421                         return false;
2422                 return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
2423         default:
2424                 if (size != size_default)
2425                         return false;
2426                 break;
2427         }
2428         return true;
2429 }
2430
/*
 * Build a BPF_LDX_MEM instruction that loads field F of
 * struct bpf_sockopt_kern. The access width is derived from the field's own
 * size, and the destination/base registers are taken from the instruction
 * being rewritten. Expects a variable named `si` to be in scope at the
 * expansion site.
 */
#define CG_SOCKOPT_READ_FIELD(F)                                        \
        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),       \
                    si->dst_reg, si->src_reg,                           \
                    offsetof(struct bpf_sockopt_kern, F))
2435
/*
 * Build a raw memory-store instruction targeting field F of
 * struct bpf_sockopt_kern. The instruction class is copied from si->code and
 * both si->src_reg and si->imm are forwarded unchanged, so the emitted store
 * keeps the same class and operand as the instruction being rewritten.
 * Expects a variable named `si` to be in scope at the expansion site.
 */
#define CG_SOCKOPT_WRITE_FIELD(F)                                       \
        BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) |    \
                      BPF_MEM | BPF_CLASS(si->code)),                   \
                     si->dst_reg, si->src_reg,                          \
                     offsetof(struct bpf_sockopt_kern, F),              \
                     si->imm)
2442
/*
 * Rewrite one access to a field of the program-visible struct bpf_sockopt
 * into instruction(s) that operate on the kernel-side
 * struct bpf_sockopt_kern.
 *
 * @type:        BPF_WRITE selects the store variant for fields that have one;
 *               any other value emits a load.
 * @si:          the instruction being rewritten; si->off selects the field,
 *               and its dst_reg/src_reg/imm/code are reused in the output.
 * @insn_buf:    buffer the replacement instructions are written to.
 * @prog:        not referenced by this function.
 * @target_size: not referenced by this function.
 *
 * Returns the number of instructions written to @insn_buf. An offset that
 * matches none of the cases below writes nothing and returns 0.
 */
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
                                         const struct bpf_insn *si,
                                         struct bpf_insn *insn_buf,
                                         struct bpf_prog *prog,
                                         u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_sockopt, sk):
                /* Only a load is emitted for sk, whatever @type is. */
                *insn++ = CG_SOCKOPT_READ_FIELD(sk);
                break;
        case offsetof(struct bpf_sockopt, level):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(level);
                break;
        case offsetof(struct bpf_sockopt, optname):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(optname);
                break;
        case offsetof(struct bpf_sockopt, optlen):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
                break;
        case offsetof(struct bpf_sockopt, retval):
                /*
                 * retval is not a member access on bpf_sockopt_kern: it is
                 * reached through ctx->current_task->bpf_ctx, and the result
                 * is dereferenced as a struct bpf_cg_run_ctx. The check below
                 * requires run_ctx to sit at offset 0 of bpf_cg_run_ctx.
                 * NOTE(review): presumably task->bpf_ctx points at that
                 * embedded run_ctx member - confirm against task_struct.
                 */
                BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);

                if (type == BPF_WRITE) {
                        /*
                         * A scratch register is needed to walk the pointer
                         * chain. Start at R9 and step down (at most twice)
                         * so that treg differs from both src_reg and dst_reg.
                         */
                        int treg = BPF_REG_9;

                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        /*
                         * Spill treg into the tmp_reg slot; si->dst_reg is
                         * used as the base address for every access to
                         * bpf_sockopt_kern in this branch.
                         */
                        *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
                        /* treg = ctx->current_task */
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
                                              treg, si->dst_reg,
                                              offsetof(struct bpf_sockopt_kern, current_task));
                        /* treg = treg->bpf_ctx */
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
                                              treg, treg,
                                              offsetof(struct task_struct, bpf_ctx));
                        /*
                         * Store into the retval field at treg, reusing the
                         * original instruction's class, src_reg and imm.
                         */
                        *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
                                               BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
                                               treg, si->src_reg,
                                               offsetof(struct bpf_cg_run_ctx, retval),
                                               si->imm);
                        /* Reload treg from the tmp_reg slot it was spilled to. */
                        *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
                } else {
                        /*
                         * Loads can chain through dst_reg itself, so no
                         * scratch register is needed:
                         * dst = ctx->current_task; dst = dst->bpf_ctx;
                         * dst = dst->retval.
                         */
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
                                              si->dst_reg, si->src_reg,
                                              offsetof(struct bpf_sockopt_kern, current_task));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
                                              si->dst_reg, si->dst_reg,
                                              offsetof(struct task_struct, bpf_ctx));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
                                              si->dst_reg, si->dst_reg,
                                              offsetof(struct bpf_cg_run_ctx, retval));
                }
                break;
        case offsetof(struct bpf_sockopt, optval):
                /* Only a load is emitted for optval, whatever @type is. */
                *insn++ = CG_SOCKOPT_READ_FIELD(optval);
                break;
        case offsetof(struct bpf_sockopt, optval_end):
                /* Only a load is emitted for optval_end, whatever @type is. */
                *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
                break;
        }

        /* Number of instructions emitted above. */
        return insn - insn_buf;
}
2520
/*
 * Prologue hook for sockopt programs.
 *
 * None of the arguments are used: nothing is written to @insn_buf and the
 * function always returns 0. Per the original note, no setup is needed for
 * the sockopt argument because its data is kzalloc'ed.
 */
static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
                                   bool direct_write,
                                   const struct bpf_prog *prog)
{
        return 0;
}
2529
/*
 * Verifier callbacks for cgroup sockopt programs: helper lookup, context
 * access validation, context access rewriting and prologue generation.
 */
const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
        .get_func_proto         = cg_sockopt_func_proto,
        .is_valid_access        = cg_sockopt_is_valid_access,
        .convert_ctx_access     = cg_sockopt_convert_ctx_access,
        .gen_prologue           = cg_sockopt_get_prologue,
};
2536
/* Program ops for cgroup sockopt programs; no callbacks are set. */
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
2539
2540 /* Common helpers for cgroup hooks. */
2541 const struct bpf_func_proto *
2542 cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2543 {
2544         switch (func_id) {
2545         case BPF_FUNC_get_local_storage:
2546                 return &bpf_get_local_storage_proto;
2547         case BPF_FUNC_get_retval:
2548                 switch (prog->expected_attach_type) {
2549                 case BPF_CGROUP_INET_INGRESS:
2550                 case BPF_CGROUP_INET_EGRESS:
2551                 case BPF_CGROUP_SOCK_OPS:
2552                 case BPF_CGROUP_UDP4_RECVMSG:
2553                 case BPF_CGROUP_UDP6_RECVMSG:
2554                 case BPF_CGROUP_UNIX_RECVMSG:
2555                 case BPF_CGROUP_INET4_GETPEERNAME:
2556                 case BPF_CGROUP_INET6_GETPEERNAME:
2557                 case BPF_CGROUP_UNIX_GETPEERNAME:
2558                 case BPF_CGROUP_INET4_GETSOCKNAME:
2559                 case BPF_CGROUP_INET6_GETSOCKNAME:
2560                 case BPF_CGROUP_UNIX_GETSOCKNAME:
2561                         return NULL;
2562                 default:
2563                         return &bpf_get_retval_proto;
2564                 }
2565         case BPF_FUNC_set_retval:
2566                 switch (prog->expected_attach_type) {
2567                 case BPF_CGROUP_INET_INGRESS:
2568                 case BPF_CGROUP_INET_EGRESS:
2569                 case BPF_CGROUP_SOCK_OPS:
2570                 case BPF_CGROUP_UDP4_RECVMSG:
2571                 case BPF_CGROUP_UDP6_RECVMSG:
2572                 case BPF_CGROUP_UNIX_RECVMSG:
2573                 case BPF_CGROUP_INET4_GETPEERNAME:
2574                 case BPF_CGROUP_INET6_GETPEERNAME:
2575                 case BPF_CGROUP_UNIX_GETPEERNAME:
2576                 case BPF_CGROUP_INET4_GETSOCKNAME:
2577                 case BPF_CGROUP_INET6_GETSOCKNAME:
2578                 case BPF_CGROUP_UNIX_GETSOCKNAME:
2579                         return NULL;
2580                 default:
2581                         return &bpf_set_retval_proto;
2582                 }
2583         default:
2584                 return NULL;
2585         }
2586 }
2587
2588 /* Common helpers for cgroup hooks with valid process context. */
2589 const struct bpf_func_proto *
2590 cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2591 {
2592         switch (func_id) {
2593         case BPF_FUNC_get_current_uid_gid:
2594                 return &bpf_get_current_uid_gid_proto;
2595         case BPF_FUNC_get_current_comm:
2596                 return &bpf_get_current_comm_proto;
2597 #ifdef CONFIG_CGROUP_NET_CLASSID
2598         case BPF_FUNC_get_cgroup_classid:
2599                 return &bpf_get_cgroup_classid_curr_proto;
2600 #endif
2601         case BPF_FUNC_current_task_under_cgroup:
2602                 return &bpf_current_task_under_cgroup_proto;
2603         default:
2604                 return NULL;
2605         }
2606 }