arch/x86/kernel/cpu/intel_rdt_rdtgroup.c

   1 /*
   2  * User interface for Resource Alloction in Resource Director Technology(RDT)
   3  *
   4  * Copyright (C) 2016 Intel Corporation
   5  *
   6  * Author: Fenghua Yu <fenghua.yu@intel.com>
   7  *
   8  * This program is free software; you can redistribute it and/or modify it
   9  * under the terms and conditions of the GNU General Public License,
  10  * version 2, as published by the Free Software Foundation.
  11  *
  12  * This program is distributed in the hope it will be useful, but WITHOUT
  13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15  * more details.
  16  *
  17  * More information about RDT be found in the Intel (R) x86 Architecture
  18  * Software Developer Manual.
  19  */
  20
  21 #define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
  22
  23 #include <linux/cacheinfo.h>
  24 #include <linux/cpu.h>
  25 #include <linux/debugfs.h>
  26 #include <linux/fs.h>
  27 #include <linux/sysfs.h>
  28 #include <linux/kernfs.h>
  29 #include <linux/seq_buf.h>
  30 #include <linux/seq_file.h>
  31 #include <linux/sched/signal.h>
  32 #include <linux/sched/task.h>
  33 #include <linux/slab.h>
  34 #include <linux/task_work.h>
  35
  36 #include <uapi/linux/magic.h>
  37
  38 #include <asm/intel_rdt_sched.h>
  39 #include "intel_rdt.h"
  40
  41 DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
  42 DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
  43 DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
  44 static struct kernfs_root *rdt_root;
  45 struct rdtgroup rdtgroup_default;
  46 LIST_HEAD(rdt_all_groups);
  47
  48 /* Kernel fs node for "info" directory under root */
  49 static struct kernfs_node *kn_info;
  50
  51 /* Kernel fs node for "mon_groups" directory under root */
  52 static struct kernfs_node *kn_mongrp;
  53
  54 /* Kernel fs node for "mon_data" directory under root */
  55 static struct kernfs_node *kn_mondata;
  56
  57 static struct seq_buf last_cmd_status;
  58 static char last_cmd_status_buf[512];
  59
  60 struct dentry *debugfs_resctrl;
  61
  62 void rdt_last_cmd_clear(void)
  63 {
  64         lockdep_assert_held(&rdtgroup_mutex);
  65         seq_buf_clear(&last_cmd_status);
  66 }
  67
  68 void rdt_last_cmd_puts(const char *s)
  69 {
  70         lockdep_assert_held(&rdtgroup_mutex);
  71         seq_buf_puts(&last_cmd_status, s);
  72 }
  73
  74 void rdt_last_cmd_printf(const char *fmt, ...)
  75 {
  76         va_list ap;
  77
  78         va_start(ap, fmt);
  79         lockdep_assert_held(&rdtgroup_mutex);
  80         seq_buf_vprintf(&last_cmd_status, fmt, ap);
  81         va_end(ap);
  82 }
  83
  84 /*
  85  * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
  86  * we can keep a bitmap of free CLOSIDs in a single integer.
  87  *
  88  * Using a global CLOSID across all resources has some advantages and
  89  * some drawbacks:
  90  * + We can simply set "current->closid" to assign a task to a resource
  91  *   group.
  92  * + Context switch code can avoid extra memory references deciding which
  93  *   CLOSID to load into the PQR_ASSOC MSR
  94  * - We give up some options in configuring resource groups across multi-socket
  95  *   systems.
  96  * - Our choices on how to configure each resource become progressively more
  97  *   limited as the number of resources grows.
  98  */
  99 static int closid_free_map;
 100 static int closid_free_map_len;
 101
 102 int closids_supported(void)
 103 {
 104         return closid_free_map_len;
 105 }
 106
 107 static void closid_init(void)
 108 {
 109         struct rdt_resource *r;
 110         int rdt_min_closid = 32;
 111
 112         /* Compute rdt_min_closid across all resources */
 113         for_each_alloc_enabled_rdt_resource(r)
 114                 rdt_min_closid = min(rdt_min_closid, r->num_closid);
 115
 116         closid_free_map = BIT_MASK(rdt_min_closid) - 1;
 117
 118         /* CLOSID 0 is always reserved for the default group */
 119         closid_free_map &= ~1;
 120         closid_free_map_len = rdt_min_closid;
 121 }
 122
 123 static int closid_alloc(void)
 124 {
 125         u32 closid = ffs(closid_free_map);
 126
 127         if (closid == 0)
 128                 return -ENOSPC;
 129         closid--;
 130         closid_free_map &= ~(1 << closid);
 131
 132         return closid;
 133 }
 134
 135 void closid_free(int closid)
 136 {
 137         closid_free_map |= 1 << closid;
 138 }
 139
 140 /**
 141  * closid_allocated - test if provided closid is in use
 142  * @closid: closid to be tested
 143  *
 144  * Return: true if @closid is currently associated with a resource group,
 145  * false if @closid is free
 146  */
 147 static bool closid_allocated(unsigned int closid)
 148 {
 149         return (closid_free_map & (1 << closid)) == 0;
 150 }
 151
 152 /**
 153  * rdtgroup_mode_by_closid - Return mode of resource group with closid
 154  * @closid: closid if the resource group
 155  *
 156  * Each resource group is associated with a @closid. Here the mode
 157  * of a resource group can be queried by searching for it using its closid.
 158  *
 159  * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
 160  */
 161 enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
 162 {
 163         struct rdtgroup *rdtgrp;
 164
 165         list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
 166                 if (rdtgrp->closid == closid)
 167                         return rdtgrp->mode;
 168         }
 169
 170         return RDT_NUM_MODES;
 171 }
 172
 173 static const char * const rdt_mode_str[] = {
 174         [RDT_MODE_SHAREABLE]            = "shareable",
 175         [RDT_MODE_EXCLUSIVE]            = "exclusive",
 176         [RDT_MODE_PSEUDO_LOCKSETUP]     = "pseudo-locksetup",
 177         [RDT_MODE_PSEUDO_LOCKED]        = "pseudo-locked",
 178 };
 179
 180 /**
 181  * rdtgroup_mode_str - Return the string representation of mode
 182  * @mode: the resource group mode as &enum rdtgroup_mode
 183  *
 184  * Return: string representation of valid mode, "unknown" otherwise
 185  */
 186 static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
 187 {
 188         if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
 189                 return "unknown";
 190
 191         return rdt_mode_str[mode];
 192 }
 193
 194 /* set uid and gid of rdtgroup dirs and files to that of the creator */
 195 static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
 196 {
 197         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
 198                                 .ia_uid = current_fsuid(),
 199                                 .ia_gid = current_fsgid(), };
 200
 201         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
 202             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
 203                 return 0;
 204
 205         return kernfs_setattr(kn, &iattr);
 206 }
 207
 208 static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
 209 {
 210         struct kernfs_node *kn;
 211         int ret;
 212
 213         kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
 214                                   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
 215                                   0, rft->kf_ops, rft, NULL, NULL);
 216         if (IS_ERR(kn))
 217                 return PTR_ERR(kn);
 218
 219         ret = rdtgroup_kn_set_ugid(kn);
 220         if (ret) {
 221                 kernfs_remove(kn);
 222                 return ret;
 223         }
 224
 225         return 0;
 226 }
 227
 228 static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
 229 {
 230         struct kernfs_open_file *of = m->private;
 231         struct rftype *rft = of->kn->priv;
 232
 233         if (rft->seq_show)
 234                 return rft->seq_show(of, m, arg);
 235         return 0;
 236 }
 237
 238 static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
 239                                    size_t nbytes, loff_t off)
 240 {
 241         struct rftype *rft = of->kn->priv;
 242
 243         if (rft->write)
 244                 return rft->write(of, buf, nbytes, off);
 245
 246         return -EINVAL;
 247 }
 248
 249 static struct kernfs_ops rdtgroup_kf_single_ops = {
 250         .atomic_write_len       = PAGE_SIZE,
 251         .write                  = rdtgroup_file_write,
 252         .seq_show               = rdtgroup_seqfile_show,
 253 };
 254
 255 static struct kernfs_ops kf_mondata_ops = {
 256         .atomic_write_len       = PAGE_SIZE,
 257         .seq_show               = rdtgroup_mondata_show,
 258 };
 259
 260 static bool is_cpu_list(struct kernfs_open_file *of)
 261 {
 262         struct rftype *rft = of->kn->priv;
 263
 264         return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
 265 }
 266
 267 static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 268                               struct seq_file *s, void *v)
 269 {
 270         struct rdtgroup *rdtgrp;
 271         struct cpumask *mask;
 272         int ret = 0;
 273
 274         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 275
 276         if (rdtgrp) {
 277                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
 278                         if (!rdtgrp->plr->d) {
 279                                 rdt_last_cmd_clear();
 280                                 rdt_last_cmd_puts("Cache domain offline\n");
 281                                 ret = -ENODEV;
 282                         } else {
 283                                 mask = &rdtgrp->plr->d->cpu_mask;
 284                                 seq_printf(s, is_cpu_list(of) ?
 285                                            "%*pbl\n" : "%*pb\n",
 286                                            cpumask_pr_args(mask));
 287                         }
 288                 } else {
 289                         seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
 290                                    cpumask_pr_args(&rdtgrp->cpu_mask));
 291                 }
 292         } else {
 293                 ret = -ENOENT;
 294         }
 295         rdtgroup_kn_unlock(of->kn);
 296
 297         return ret;
 298 }
 299
 300 /*
 301  * This is safe against intel_rdt_sched_in() called from __switch_to()
 302  * because __switch_to() is executed with interrupts disabled. A local call
 303  * from update_closid_rmid() is proteced against __switch_to() because
 304  * preemption is disabled.
 305  */
 306 static void update_cpu_closid_rmid(void *info)
 307 {
 308         struct rdtgroup *r = info;
 309
 310         if (r) {
 311                 this_cpu_write(pqr_state.default_closid, r->closid);
 312                 this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
 313         }
 314
 315         /*
 316          * We cannot unconditionally write the MSR because the current
 317          * executing task might have its own closid selected. Just reuse
 318          * the context switch code.
 319          */
 320         intel_rdt_sched_in();
 321 }
 322
 323 /*
 324  * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
 325  *
 326  * Per task closids/rmids must have been set up before calling this function.
 327  */
 328 static void
 329 update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
 330 {
 331         int cpu = get_cpu();
 332
 333         if (cpumask_test_cpu(cpu, cpu_mask))
 334                 update_cpu_closid_rmid(r);
 335         smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
 336         put_cpu();
 337 }
 338
 339 static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 340                           cpumask_var_t tmpmask)
 341 {
 342         struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
 343         struct list_head *head;
 344
 345         /* Check whether cpus belong to parent ctrl group */
 346         cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
 347         if (cpumask_weight(tmpmask)) {
 348                 rdt_last_cmd_puts("can only add CPUs to mongroup that belong to parent\n");
 349                 return -EINVAL;
 350         }
 351
 352         /* Check whether cpus are dropped from this group */
 353         cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 354         if (cpumask_weight(tmpmask)) {
 355                 /* Give any dropped cpus to parent rdtgroup */
 356                 cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
 357                 update_closid_rmid(tmpmask, prgrp);
 358         }
 359
 360         /*
 361          * If we added cpus, remove them from previous group that owned them
 362          * and update per-cpu rmid
 363          */
 364         cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 365         if (cpumask_weight(tmpmask)) {
 366                 head = &prgrp->mon.crdtgrp_list;
 367                 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 368                         if (crgrp == rdtgrp)
 369                                 continue;
 370                         cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
 371                                        tmpmask);
 372                 }
 373                 update_closid_rmid(tmpmask, rdtgrp);
 374         }
 375
 376         /* Done pushing/pulling - update this group with new mask */
 377         cpumask_copy(&rdtgrp->cpu_mask, newmask);
 378
 379         return 0;
 380 }
 381
 382 static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
 383 {
 384         struct rdtgroup *crgrp;
 385
 386         cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
 387         /* update the child mon group masks as well*/
 388         list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
 389                 cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
 390 }
 391
 392 static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 393                            cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
 394 {
 395         struct rdtgroup *r, *crgrp;
 396         struct list_head *head;
 397
 398         /* Check whether cpus are dropped from this group */
 399         cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 400         if (cpumask_weight(tmpmask)) {
 401                 /* Can't drop from default group */
 402                 if (rdtgrp == &rdtgroup_default) {
 403                         rdt_last_cmd_puts("Can't drop CPUs from default group\n");
 404                         return -EINVAL;
 405                 }
 406
 407                 /* Give any dropped cpus to rdtgroup_default */
 408                 cpumask_or(&rdtgroup_default.cpu_mask,
 409                            &rdtgroup_default.cpu_mask, tmpmask);
 410                 update_closid_rmid(tmpmask, &rdtgroup_default);
 411         }
 412
 413         /*
 414          * If we added cpus, remove them from previous group and
 415          * the prev group's child groups that owned them
 416          * and update per-cpu closid/rmid.
 417          */
 418         cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 419         if (cpumask_weight(tmpmask)) {
 420                 list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
 421                         if (r == rdtgrp)
 422                                 continue;
 423                         cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
 424                         if (cpumask_weight(tmpmask1))
 425                                 cpumask_rdtgrp_clear(r, tmpmask1);
 426                 }
 427                 update_closid_rmid(tmpmask, rdtgrp);
 428         }
 429
 430         /* Done pushing/pulling - update this group with new mask */
 431         cpumask_copy(&rdtgrp->cpu_mask, newmask);
 432
 433         /*
 434          * Clear child mon group masks since there is a new parent mask
 435          * now and update the rmid for the cpus the child lost.
 436          */
 437         head = &rdtgrp->mon.crdtgrp_list;
 438         list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 439                 cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
 440                 update_closid_rmid(tmpmask, rdtgrp);
 441                 cpumask_clear(&crgrp->cpu_mask);
 442         }
 443
 444         return 0;
 445 }
 446
 447 static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 448                                    char *buf, size_t nbytes, loff_t off)
 449 {
 450         cpumask_var_t tmpmask, newmask, tmpmask1;
 451         struct rdtgroup *rdtgrp;
 452         int ret;
 453
 454         if (!buf)
 455                 return -EINVAL;
 456
 457         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 458                 return -ENOMEM;
 459         if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
 460                 free_cpumask_var(tmpmask);
 461                 return -ENOMEM;
 462         }
 463         if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
 464                 free_cpumask_var(tmpmask);
 465                 free_cpumask_var(newmask);
 466                 return -ENOMEM;
 467         }
 468
 469         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 470         rdt_last_cmd_clear();
 471         if (!rdtgrp) {
 472                 ret = -ENOENT;
 473                 rdt_last_cmd_puts("directory was removed\n");
 474                 goto unlock;
 475         }
 476
 477         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 478             rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 479                 ret = -EINVAL;
 480                 rdt_last_cmd_puts("pseudo-locking in progress\n");
 481                 goto unlock;
 482         }
 483
 484         if (is_cpu_list(of))
 485                 ret = cpulist_parse(buf, newmask);
 486         else
 487                 ret = cpumask_parse(buf, newmask);
 488
 489         if (ret) {
 490                 rdt_last_cmd_puts("bad cpu list/mask\n");
 491                 goto unlock;
 492         }
 493
 494         /* check that user didn't specify any offline cpus */
 495         cpumask_andnot(tmpmask, newmask, cpu_online_mask);
 496         if (cpumask_weight(tmpmask)) {
 497                 ret = -EINVAL;
 498                 rdt_last_cmd_puts("can only assign online cpus\n");
 499                 goto unlock;
 500         }
 501
 502         if (rdtgrp->type == RDTCTRL_GROUP)
 503                 ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
 504         else if (rdtgrp->type == RDTMON_GROUP)
 505                 ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
 506         else
 507                 ret = -EINVAL;
 508
 509 unlock:
 510         rdtgroup_kn_unlock(of->kn);
 511         free_cpumask_var(tmpmask);
 512         free_cpumask_var(newmask);
 513         free_cpumask_var(tmpmask1);
 514
 515         return ret ?: nbytes;
 516 }
 517
 518 struct task_move_callback {
 519         struct callback_head    work;
 520         struct rdtgroup         *rdtgrp;
 521 };
 522
 523 static void move_myself(struct callback_head *head)
 524 {
 525         struct task_move_callback *callback;
 526         struct rdtgroup *rdtgrp;
 527
 528         callback = container_of(head, struct task_move_callback, work);
 529         rdtgrp = callback->rdtgrp;
 530
 531         /*
 532          * If resource group was deleted before this task work callback
 533          * was invoked, then assign the task to root group and free the
 534          * resource group.
 535          */
 536         if (atomic_dec_and_test(&rdtgrp->waitcount) &&
 537             (rdtgrp->flags & RDT_DELETED)) {
 538                 current->closid = 0;
 539                 current->rmid = 0;
 540                 kfree(rdtgrp);
 541         }
 542
 543         preempt_disable();
 544         /* update PQR_ASSOC MSR to make resource group go into effect */
 545         intel_rdt_sched_in();
 546         preempt_enable();
 547
 548         kfree(callback);
 549 }
 550
 551 static int __rdtgroup_move_task(struct task_struct *tsk,
 552                                 struct rdtgroup *rdtgrp)
 553 {
 554         struct task_move_callback *callback;
 555         int ret;
 556
 557         callback = kzalloc(sizeof(*callback), GFP_KERNEL);
 558         if (!callback)
 559                 return -ENOMEM;
 560         callback->work.func = move_myself;
 561         callback->rdtgrp = rdtgrp;
 562
 563         /*
 564          * Take a refcount, so rdtgrp cannot be freed before the
 565          * callback has been invoked.
 566          */
 567         atomic_inc(&rdtgrp->waitcount);
 568         ret = task_work_add(tsk, &callback->work, true);
 569         if (ret) {
 570                 /*
 571                  * Task is exiting. Drop the refcount and free the callback.
 572                  * No need to check the refcount as the group cannot be
 573                  * deleted before the write function unlocks rdtgroup_mutex.
 574                  */
 575                 atomic_dec(&rdtgrp->waitcount);
 576                 kfree(callback);
 577                 rdt_last_cmd_puts("task exited\n");
 578         } else {
 579                 /*
 580                  * For ctrl_mon groups move both closid and rmid.
 581                  * For monitor groups, can move the tasks only from
 582                  * their parent CTRL group.
 583                  */
 584                 if (rdtgrp->type == RDTCTRL_GROUP) {
 585                         tsk->closid = rdtgrp->closid;
 586                         tsk->rmid = rdtgrp->mon.rmid;
 587                 } else if (rdtgrp->type == RDTMON_GROUP) {
 588                         if (rdtgrp->mon.parent->closid == tsk->closid) {
 589                                 tsk->rmid = rdtgrp->mon.rmid;
 590                         } else {
 591                                 rdt_last_cmd_puts("Can't move task to different control group\n");
 592                                 ret = -EINVAL;
 593                         }
 594                 }
 595         }
 596         return ret;
 597 }
 598
 599 /**
 600  * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
 601  * @r: Resource group
 602  *
 603  * Return: 1 if tasks have been assigned to @r, 0 otherwise
 604  */
 605 int rdtgroup_tasks_assigned(struct rdtgroup *r)
 606 {
 607         struct task_struct *p, *t;
 608         int ret = 0;
 609
 610         lockdep_assert_held(&rdtgroup_mutex);
 611
 612         rcu_read_lock();
 613         for_each_process_thread(p, t) {
 614                 if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
 615                     (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) {
 616                         ret = 1;
 617                         break;
 618                 }
 619         }
 620         rcu_read_unlock();
 621
 622         return ret;
 623 }
 624
 625 static int rdtgroup_task_write_permission(struct task_struct *task,
 626                                           struct kernfs_open_file *of)
 627 {
 628         const struct cred *tcred = get_task_cred(task);
 629         const struct cred *cred = current_cred();
 630         int ret = 0;
 631
 632         /*
 633          * Even if we're attaching all tasks in the thread group, we only
 634          * need to check permissions on one of them.
 635          */
 636         if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 637             !uid_eq(cred->euid, tcred->uid) &&
 638             !uid_eq(cred->euid, tcred->suid)) {
 639                 rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
 640                 ret = -EPERM;
 641         }
 642
 643         put_cred(tcred);
 644         return ret;
 645 }
 646
 647 static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
 648                               struct kernfs_open_file *of)
 649 {
 650         struct task_struct *tsk;
 651         int ret;
 652
 653         rcu_read_lock();
 654         if (pid) {
 655                 tsk = find_task_by_vpid(pid);
 656                 if (!tsk) {
 657                         rcu_read_unlock();
 658                         rdt_last_cmd_printf("No task %d\n", pid);
 659                         return -ESRCH;
 660                 }
 661         } else {
 662                 tsk = current;
 663         }
 664
 665         get_task_struct(tsk);
 666         rcu_read_unlock();
 667
 668         ret = rdtgroup_task_write_permission(tsk, of);
 669         if (!ret)
 670                 ret = __rdtgroup_move_task(tsk, rdtgrp);
 671
 672         put_task_struct(tsk);
 673         return ret;
 674 }
 675
 676 static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
 677                                     char *buf, size_t nbytes, loff_t off)
 678 {
 679         struct rdtgroup *rdtgrp;
 680         int ret = 0;
 681         pid_t pid;
 682
 683         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 684                 return -EINVAL;
 685         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 686         if (!rdtgrp) {
 687                 rdtgroup_kn_unlock(of->kn);
 688                 return -ENOENT;
 689         }
 690         rdt_last_cmd_clear();
 691
 692         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 693             rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 694                 ret = -EINVAL;
 695                 rdt_last_cmd_puts("pseudo-locking in progress\n");
 696                 goto unlock;
 697         }
 698
 699         ret = rdtgroup_move_task(pid, rdtgrp, of);
 700
 701 unlock:
 702         rdtgroup_kn_unlock(of->kn);
 703
 704         return ret ?: nbytes;
 705 }
 706
 707 static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
 708 {
 709         struct task_struct *p, *t;
 710
 711         rcu_read_lock();
 712         for_each_process_thread(p, t) {
 713                 if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
 714                     (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
 715                         seq_printf(s, "%d\n", t->pid);
 716         }
 717         rcu_read_unlock();
 718 }
 719
 720 static int rdtgroup_tasks_show(struct kernfs_open_file *of,
 721                                struct seq_file *s, void *v)
 722 {
 723         struct rdtgroup *rdtgrp;
 724         int ret = 0;
 725
 726         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 727         if (rdtgrp)
 728                 show_rdt_tasks(rdtgrp, s);
 729         else
 730                 ret = -ENOENT;
 731         rdtgroup_kn_unlock(of->kn);
 732
 733         return ret;
 734 }
 735
 736 static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
 737                                     struct seq_file *seq, void *v)
 738 {
 739         int len;
 740
 741         mutex_lock(&rdtgroup_mutex);
 742         len = seq_buf_used(&last_cmd_status);
 743         if (len)
 744                 seq_printf(seq, "%.*s", len, last_cmd_status_buf);
 745         else
 746                 seq_puts(seq, "ok\n");
 747         mutex_unlock(&rdtgroup_mutex);
 748         return 0;
 749 }
 750
 751 static int rdt_num_closids_show(struct kernfs_open_file *of,
 752                                 struct seq_file *seq, void *v)
 753 {
 754         struct rdt_resource *r = of->kn->parent->priv;
 755
 756         seq_printf(seq, "%d\n", r->num_closid);
 757         return 0;
 758 }
 759
 760 static int rdt_default_ctrl_show(struct kernfs_open_file *of,
 761                              struct seq_file *seq, void *v)
 762 {
 763         struct rdt_resource *r = of->kn->parent->priv;
 764
 765         seq_printf(seq, "%x\n", r->default_ctrl);
 766         return 0;
 767 }
 768
 769 static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
 770                              struct seq_file *seq, void *v)
 771 {
 772         struct rdt_resource *r = of->kn->parent->priv;
 773
 774         seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
 775         return 0;
 776 }
 777
 778 static int rdt_shareable_bits_show(struct kernfs_open_file *of,
 779                                    struct seq_file *seq, void *v)
 780 {
 781         struct rdt_resource *r = of->kn->parent->priv;
 782
 783         seq_printf(seq, "%x\n", r->cache.shareable_bits);
 784         return 0;
 785 }
 786
 787 /**
 788  * rdt_bit_usage_show - Display current usage of resources
 789  *
 790  * A domain is a shared resource that can now be allocated differently. Here
 791  * we display the current regions of the domain as an annotated bitmask.
 792  * For each domain of this resource its allocation bitmask
 793  * is annotated as below to indicate the current usage of the corresponding bit:
 794  *   0 - currently unused
 795  *   X - currently available for sharing and used by software and hardware
 796  *   H - currently used by hardware only but available for software use
 797  *   S - currently used and shareable by software only
 798  *   E - currently used exclusively by one resource group
 799  *   P - currently pseudo-locked by one resource group
 800  */
 801 static int rdt_bit_usage_show(struct kernfs_open_file *of,
 802                               struct seq_file *seq, void *v)
 803 {
 804         struct rdt_resource *r = of->kn->parent->priv;
 805         u32 sw_shareable = 0, hw_shareable = 0;
 806         u32 exclusive = 0, pseudo_locked = 0;
 807         struct rdt_domain *dom;
 808         int i, hwb, swb, excl, psl;
 809         enum rdtgrp_mode mode;
 810         bool sep = false;
 811         u32 *ctrl;
 812
 813         mutex_lock(&rdtgroup_mutex);
 814         hw_shareable = r->cache.shareable_bits;
 815         list_for_each_entry(dom, &r->domains, list) {
 816                 if (sep)
 817                         seq_putc(seq, ';');
 818                 ctrl = dom->ctrl_val;
 819                 sw_shareable = 0;
 820                 exclusive = 0;
 821                 seq_printf(seq, "%d=", dom->id);
 822                 for (i = 0; i < closids_supported(); i++, ctrl++) {
 823                         if (!closid_allocated(i))
 824                                 continue;
 825                         mode = rdtgroup_mode_by_closid(i);
 826                         switch (mode) {
 827                         case RDT_MODE_SHAREABLE:
 828                                 sw_shareable |= *ctrl;
 829                                 break;
 830                         case RDT_MODE_EXCLUSIVE:
 831                                 exclusive |= *ctrl;
 832                                 break;
 833                         case RDT_MODE_PSEUDO_LOCKSETUP:
 834                         /*
 835                          * RDT_MODE_PSEUDO_LOCKSETUP is possible
 836                          * here but not included since the CBM
 837                          * associated with this CLOSID in this mode
 838                          * is not initialized and no task or cpu can be
 839                          * assigned this CLOSID.
 840                          */
 841                                 break;
 842                         case RDT_MODE_PSEUDO_LOCKED:
 843                         case RDT_NUM_MODES:
 844                                 WARN(1,
 845                                      "invalid mode for closid %d\n", i);
 846                                 break;
 847                         }
 848                 }
 849                 for (i = r->cache.cbm_len - 1; i >= 0; i--) {
 850                         pseudo_locked = dom->plr ? dom->plr->cbm : 0;
 851                         hwb = test_bit(i, (unsigned long *)&hw_shareable);
 852                         swb = test_bit(i, (unsigned long *)&sw_shareable);
 853                         excl = test_bit(i, (unsigned long *)&exclusive);
 854                         psl = test_bit(i, (unsigned long *)&pseudo_locked);
 855                         if (hwb && swb)
 856                                 seq_putc(seq, 'X');
 857                         else if (hwb && !swb)
 858                                 seq_putc(seq, 'H');
 859                         else if (!hwb && swb)
 860                                 seq_putc(seq, 'S');
 861                         else if (excl)
 862                                 seq_putc(seq, 'E');
 863                         else if (psl)
 864                                 seq_putc(seq, 'P');
 865                         else /* Unused bits remain */
 866                                 seq_putc(seq, '0');
 867                 }
 868                 sep = true;
 869         }
 870         seq_putc(seq, '\n');
 871         mutex_unlock(&rdtgroup_mutex);
 872         return 0;
 873 }
 874
 875 static int rdt_min_bw_show(struct kernfs_open_file *of,
 876                              struct seq_file *seq, void *v)
 877 {
 878         struct rdt_resource *r = of->kn->parent->priv;
 879
 880         seq_printf(seq, "%u\n", r->membw.min_bw);
 881         return 0;
 882 }
 883
 884 static int rdt_num_rmids_show(struct kernfs_open_file *of,
 885                               struct seq_file *seq, void *v)
 886 {
 887         struct rdt_resource *r = of->kn->parent->priv;
 888
 889         seq_printf(seq, "%d\n", r->num_rmid);
 890
 891         return 0;
 892 }
 893
 894 static int rdt_mon_features_show(struct kernfs_open_file *of,
 895                                  struct seq_file *seq, void *v)
 896 {
 897         struct rdt_resource *r = of->kn->parent->priv;
 898         struct mon_evt *mevt;
 899
 900         list_for_each_entry(mevt, &r->evt_list, list)
 901                 seq_printf(seq, "%s\n", mevt->name);
 902
 903         return 0;
 904 }
 905
 906 static int rdt_bw_gran_show(struct kernfs_open_file *of,
 907                              struct seq_file *seq, void *v)
 908 {
 909         struct rdt_resource *r = of->kn->parent->priv;
 910
 911         seq_printf(seq, "%u\n", r->membw.bw_gran);
 912         return 0;
 913 }
 914
 915 static int rdt_delay_linear_show(struct kernfs_open_file *of,
 916                              struct seq_file *seq, void *v)
 917 {
 918         struct rdt_resource *r = of->kn->parent->priv;
 919
 920         seq_printf(seq, "%u\n", r->membw.delay_linear);
 921         return 0;
 922 }
 923
 924 static int max_threshold_occ_show(struct kernfs_open_file *of,
 925                                   struct seq_file *seq, void *v)
 926 {
 927         struct rdt_resource *r = of->kn->parent->priv;
 928
 929         seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
 930
 931         return 0;
 932 }
 933
 934 static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
 935                                        char *buf, size_t nbytes, loff_t off)
 936 {
 937         struct rdt_resource *r = of->kn->parent->priv;
 938         unsigned int bytes;
 939         int ret;
 940
 941         ret = kstrtouint(buf, 0, &bytes);
 942         if (ret)
 943                 return ret;
 944
 945         if (bytes > (boot_cpu_data.x86_cache_size * 1024))
 946                 return -EINVAL;
 947
 948         intel_cqm_threshold = bytes / r->mon_scale;
 949
 950         return nbytes;
 951 }
 952
 953 /*
 954  * rdtgroup_mode_show - Display mode of this resource group
 955  */
 956 static int rdtgroup_mode_show(struct kernfs_open_file *of,
 957                               struct seq_file *s, void *v)
 958 {
 959         struct rdtgroup *rdtgrp;
 960
 961         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 962         if (!rdtgrp) {
 963                 rdtgroup_kn_unlock(of->kn);
 964                 return -ENOENT;
 965         }
 966
 967         seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
 968
 969         rdtgroup_kn_unlock(of->kn);
 970         return 0;
 971 }
 972
 973 /**
 974  * rdt_cdp_peer_get - Retrieve CDP peer if it exists
 975  * @r: RDT resource to which RDT domain @d belongs
 976  * @d: Cache instance for which a CDP peer is requested
 977  * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer)
 978  *         Used to return the result.
 979  * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer)
 980  *         Used to return the result.
 981  *
 982  * RDT resources are managed independently and by extension the RDT domains
 983  * (RDT resource instances) are managed independently also. The Code and
 984  * Data Prioritization (CDP) RDT resources, while managed independently,
 985  * could refer to the same underlying hardware. For example,
 986  * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache.
 987  *
 988  * When provided with an RDT resource @r and an instance of that RDT
 989  * resource @d rdt_cdp_peer_get() will return if there is a peer RDT
 990  * resource and the exact instance that shares the same hardware.
 991  *
 992  * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists.
 993  *         If a CDP peer was found, @r_cdp will point to the peer RDT resource
 994  *         and @d_cdp will point to the peer RDT domain.
 995  */
 996 static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
 997                             struct rdt_resource **r_cdp,
 998                             struct rdt_domain **d_cdp)
 999 {
1000         struct rdt_resource *_r_cdp = NULL;
1001         struct rdt_domain *_d_cdp = NULL;
1002         int ret = 0;
1003
1004         switch (r->rid) {
1005         case RDT_RESOURCE_L3DATA:
1006                 _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE];
1007                 break;
1008         case RDT_RESOURCE_L3CODE:
1009                 _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L3DATA];
1010                 break;
1011         case RDT_RESOURCE_L2DATA:
1012                 _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L2CODE];
1013                 break;
1014         case RDT_RESOURCE_L2CODE:
1015                 _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L2DATA];
1016                 break;
1017         default:
1018                 ret = -ENOENT;
1019                 goto out;
1020         }
1021
1022         /*
1023          * When a new CPU comes online and CDP is enabled then the new
1024          * RDT domains (if any) associated with both CDP RDT resources
1025          * are added in the same CPU online routine while the
1026          * rdtgroup_mutex is held. It should thus not happen for one
1027          * RDT domain to exist and be associated with its RDT CDP
1028          * resource but there is no RDT domain associated with the
1029          * peer RDT CDP resource. Hence the WARN.
1030          */
1031         _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
1032         if (WARN_ON(!_d_cdp)) {
1033                 _r_cdp = NULL;
1034                 ret = -EINVAL;
1035         }
1036
1037 out:
1038         *r_cdp = _r_cdp;
1039         *d_cdp = _d_cdp;
1040
1041         return ret;
1042 }
1043
1044 /**
1045  * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
1046  * @r: Resource to which domain instance @d belongs.
1047  * @d: The domain instance for which @closid is being tested.
1048  * @cbm: Capacity bitmask being tested.
1049  * @closid: Intended closid for @cbm.
1050  * @exclusive: Only check if overlaps with exclusive resource groups
1051  *
1052  * Checks if provided @cbm intended to be used for @closid on domain
1053  * @d overlaps with any other closids or other hardware usage associated
1054  * with this domain. If @exclusive is true then only overlaps with
1055  * resource groups in exclusive mode will be considered. If @exclusive
1056  * is false then overlaps with any resource group or hardware entities
1057  * will be considered.
1058  *
1059  * @cbm is unsigned long, even if only 32 bits are used, to make the
1060  * bitmap functions work correctly.
1061  *
1062  * Return: false if CBM does not overlap, true if it does.
1063  */
1064 static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
1065                                     unsigned long cbm, int closid, bool exclusive)
1066 {
1067         enum rdtgrp_mode mode;
1068         unsigned long ctrl_b;
1069         u32 *ctrl;
1070         int i;
1071
1072         /* Check for any overlap with regions used by hardware directly */
1073         if (!exclusive) {
1074                 ctrl_b = r->cache.shareable_bits;
1075                 if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
1076                         return true;
1077         }
1078
1079         /* Check for overlap with other resource groups */
1080         ctrl = d->ctrl_val;
1081         for (i = 0; i < closids_supported(); i++, ctrl++) {
1082                 ctrl_b = *ctrl;
1083                 mode = rdtgroup_mode_by_closid(i);
1084                 if (closid_allocated(i) && i != closid &&
1085                     mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1086                         if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1087                                 if (exclusive) {
1088                                         if (mode == RDT_MODE_EXCLUSIVE)
1089                                                 return true;
1090                                         continue;
1091                                 }
1092                                 return true;
1093                         }
1094                 }
1095         }
1096
1097         return false;
1098 }
1099
1100 /**
1101  * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
1102  * @r: Resource to which domain instance @d belongs.
1103  * @d: The domain instance for which @closid is being tested.
1104  * @cbm: Capacity bitmask being tested.
1105  * @closid: Intended closid for @cbm.
1106  * @exclusive: Only check if overlaps with exclusive resource groups
1107  *
1108  * Resources that can be allocated using a CBM can use the CBM to control
1109  * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
1110  * for overlap. Overlap test is not limited to the specific resource for
1111  * which the CBM is intended though - when dealing with CDP resources that
1112  * share the underlying hardware the overlap check should be performed on
1113  * the CDP resource sharing the hardware also.
1114  *
1115  * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
1116  * overlap test.
1117  *
1118  * Return: true if CBM overlap detected, false if there is no overlap
1119  */
1120 bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
1121                            unsigned long cbm, int closid, bool exclusive)
1122 {
1123         struct rdt_resource *r_cdp;
1124         struct rdt_domain *d_cdp;
1125
1126         if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive))
1127                 return true;
1128
1129         if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0)
1130                 return false;
1131
1132         return  __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive);
1133 }
1134
1135 /**
1136  * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1137  *
1138  * An exclusive resource group implies that there should be no sharing of
1139  * its allocated resources. At the time this group is considered to be
1140  * exclusive this test can determine if its current schemata supports this
1141  * setting by testing for overlap with all other resource groups.
1142  *
1143  * Return: true if resource group can be exclusive, false if there is overlap
1144  * with allocations of other resource groups and thus this resource group
1145  * cannot be exclusive.
1146  */
1147 static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1148 {
1149         int closid = rdtgrp->closid;
1150         struct rdt_resource *r;
1151         bool has_cache = false;
1152         struct rdt_domain *d;
1153
1154         for_each_alloc_enabled_rdt_resource(r) {
1155                 if (r->rid == RDT_RESOURCE_MBA)
1156                         continue;
1157                 has_cache = true;
1158                 list_for_each_entry(d, &r->domains, list) {
1159                         if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
1160                                                   rdtgrp->closid, false)) {
1161                                 rdt_last_cmd_puts("schemata overlaps\n");
1162                                 return false;
1163                         }
1164                 }
1165         }
1166
1167         if (!has_cache) {
1168                 rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n");
1169                 return false;
1170         }
1171
1172         return true;
1173 }
1174
1175 /**
1176  * rdtgroup_mode_write - Modify the resource group's mode
1177  *
1178  */
1179 static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
1180                                    char *buf, size_t nbytes, loff_t off)
1181 {
1182         struct rdtgroup *rdtgrp;
1183         enum rdtgrp_mode mode;
1184         int ret = 0;
1185
1186         /* Valid input requires a trailing newline */
1187         if (nbytes == 0 || buf[nbytes - 1] != '\n')
1188                 return -EINVAL;
1189         buf[nbytes - 1] = '\0';
1190
1191         rdtgrp = rdtgroup_kn_lock_live(of->kn);
1192         if (!rdtgrp) {
1193                 rdtgroup_kn_unlock(of->kn);
1194                 return -ENOENT;
1195         }
1196
1197         rdt_last_cmd_clear();
1198
1199         mode = rdtgrp->mode;
1200
1201         if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
1202             (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
1203             (!strcmp(buf, "pseudo-locksetup") &&
1204              mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
1205             (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
1206                 goto out;
1207
1208         if (mode == RDT_MODE_PSEUDO_LOCKED) {
1209                 rdt_last_cmd_printf("cannot change pseudo-locked group\n");
1210                 ret = -EINVAL;
1211                 goto out;
1212         }
1213
1214         if (!strcmp(buf, "shareable")) {
1215                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1216                         ret = rdtgroup_locksetup_exit(rdtgrp);
1217                         if (ret)
1218                                 goto out;
1219                 }
1220                 rdtgrp->mode = RDT_MODE_SHAREABLE;
1221         } else if (!strcmp(buf, "exclusive")) {
1222                 if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
1223                         ret = -EINVAL;
1224                         goto out;
1225                 }
1226                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1227                         ret = rdtgroup_locksetup_exit(rdtgrp);
1228                         if (ret)
1229                                 goto out;
1230                 }
1231                 rdtgrp->mode = RDT_MODE_EXCLUSIVE;
1232         } else if (!strcmp(buf, "pseudo-locksetup")) {
1233                 ret = rdtgroup_locksetup_enter(rdtgrp);
1234                 if (ret)
1235                         goto out;
1236                 rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
1237         } else {
1238                 rdt_last_cmd_printf("unknown/unsupported mode\n");
1239                 ret = -EINVAL;
1240         }
1241
1242 out:
1243         rdtgroup_kn_unlock(of->kn);
1244         return ret ?: nbytes;
1245 }
1246
1247 /**
1248  * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1249  * @r: RDT resource to which @d belongs.
1250  * @d: RDT domain instance.
1251  * @cbm: bitmask for which the size should be computed.
1252  *
1253  * The bitmask provided associated with the RDT domain instance @d will be
1254  * translated into how many bytes it represents. The size in bytes is
1255  * computed by first dividing the total cache size by the CBM length to
1256  * determine how many bytes each bit in the bitmask represents. The result
1257  * is multiplied with the number of bits set in the bitmask.
1258  *
1259  * @cbm is unsigned long, even if only 32 bits are used to make the
1260  * bitmap functions work correctly.
1261  */
1262 unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1263                                   struct rdt_domain *d, unsigned long cbm)
1264 {
1265         struct cpu_cacheinfo *ci;
1266         unsigned int size = 0;
1267         int num_b, i;
1268
1269         num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1270         ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
1271         for (i = 0; i < ci->num_leaves; i++) {
1272                 if (ci->info_list[i].level == r->cache_level) {
1273                         size = ci->info_list[i].size / r->cache.cbm_len * num_b;
1274                         break;
1275                 }
1276         }
1277
1278         return size;
1279 }
1280
1281 /**
1282  * rdtgroup_size_show - Display size in bytes of allocated regions
1283  *
1284  * The "size" file mirrors the layout of the "schemata" file, printing the
1285  * size in bytes of each region instead of the capacity bitmask.
1286  *
1287  */
1288 static int rdtgroup_size_show(struct kernfs_open_file *of,
1289                               struct seq_file *s, void *v)
1290 {
1291         struct rdtgroup *rdtgrp;
1292         struct rdt_resource *r;
1293         struct rdt_domain *d;
1294         unsigned int size;
1295         int ret = 0;
1296         bool sep;
1297         u32 ctrl;
1298
1299         rdtgrp = rdtgroup_kn_lock_live(of->kn);
1300         if (!rdtgrp) {
1301                 rdtgroup_kn_unlock(of->kn);
1302                 return -ENOENT;
1303         }
1304
1305         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
1306                 if (!rdtgrp->plr->d) {
1307                         rdt_last_cmd_clear();
1308                         rdt_last_cmd_puts("Cache domain offline\n");
1309                         ret = -ENODEV;
1310                 } else {
1311                         seq_printf(s, "%*s:", max_name_width,
1312                                    rdtgrp->plr->r->name);
1313                         size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
1314                                                     rdtgrp->plr->d,
1315                                                     rdtgrp->plr->cbm);
1316                         seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
1317                 }
1318                 goto out;
1319         }
1320
1321         for_each_alloc_enabled_rdt_resource(r) {
1322                 sep = false;
1323                 seq_printf(s, "%*s:", max_name_width, r->name);
1324                 list_for_each_entry(d, &r->domains, list) {
1325                         if (sep)
1326                                 seq_putc(s, ';');
1327                         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1328                                 size = 0;
1329                         } else {
1330                                 ctrl = (!is_mba_sc(r) ?
1331                                                 d->ctrl_val[rdtgrp->closid] :
1332                                                 d->mbps_val[rdtgrp->closid]);
1333                                 if (r->rid == RDT_RESOURCE_MBA)
1334                                         size = ctrl;
1335                                 else
1336                                         size = rdtgroup_cbm_to_size(r, d, ctrl);
1337                         }
1338                         seq_printf(s, "%d=%u", d->id, size);
1339                         sep = true;
1340                 }
1341                 seq_putc(s, '\n');
1342         }
1343
1344 out:
1345         rdtgroup_kn_unlock(of->kn);
1346
1347         return ret;
1348 }
1349
1350 /* rdtgroup information files for one cache resource. */
1351 static struct rftype res_common_files[] = {
1352         {
1353                 .name           = "last_cmd_status",
1354                 .mode           = 0444,
1355                 .kf_ops         = &rdtgroup_kf_single_ops,
1356                 .seq_show       = rdt_last_cmd_status_show,
1357                 .fflags         = RF_TOP_INFO,
1358         },
1359         {
1360                 .name           = "num_closids",
1361                 .mode           = 0444,
1362                 .kf_ops         = &rdtgroup_kf_single_ops,
1363                 .seq_show       = rdt_num_closids_show,
1364                 .fflags         = RF_CTRL_INFO,
1365         },
1366         {
1367                 .name           = "mon_features",
1368                 .mode           = 0444,
1369                 .kf_ops         = &rdtgroup_kf_single_ops,
1370                 .seq_show       = rdt_mon_features_show,
1371                 .fflags         = RF_MON_INFO,
1372         },
1373         {
1374                 .name           = "num_rmids",
1375                 .mode           = 0444,
1376                 .kf_ops         = &rdtgroup_kf_single_ops,
1377                 .seq_show       = rdt_num_rmids_show,
1378                 .fflags         = RF_MON_INFO,
1379         },
1380         {
1381                 .name           = "cbm_mask",
1382                 .mode           = 0444,
1383                 .kf_ops         = &rdtgroup_kf_single_ops,
1384                 .seq_show       = rdt_default_ctrl_show,
1385                 .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1386         },
1387         {
1388                 .name           = "min_cbm_bits",
1389                 .mode           = 0444,
1390                 .kf_ops         = &rdtgroup_kf_single_ops,
1391                 .seq_show       = rdt_min_cbm_bits_show,
1392                 .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1393         },
1394         {
1395                 .name           = "shareable_bits",
1396                 .mode           = 0444,
1397                 .kf_ops         = &rdtgroup_kf_single_ops,
1398                 .seq_show       = rdt_shareable_bits_show,
1399                 .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1400         },
1401         {
1402                 .name           = "bit_usage",
1403                 .mode           = 0444,
1404                 .kf_ops         = &rdtgroup_kf_single_ops,
1405                 .seq_show       = rdt_bit_usage_show,
1406                 .fflags         = RF_CTRL_INFO | RFTYPE_RES_CACHE,
1407         },
1408         {
1409                 .name           = "min_bandwidth",
1410                 .mode           = 0444,
1411                 .kf_ops         = &rdtgroup_kf_single_ops,
1412                 .seq_show       = rdt_min_bw_show,
1413                 .fflags         = RF_CTRL_INFO | RFTYPE_RES_MB,
1414         },
1415         {
1416                 .name           = "bandwidth_gran",
1417                 .mode           = 0444,
1418                 .kf_ops         = &rdtgroup_kf_single_ops,
1419                 .seq_show       = rdt_bw_gran_show,
1420                 .fflags         = RF_CTRL_INFO | RFTYPE_RES_MB,
1421         },
1422         {
1423                 .name           = "delay_linear",
1424                 .mode           = 0444,
1425                 .kf_ops         = &rdtgroup_kf_single_ops,
1426                 .seq_show       = rdt_delay_linear_show,
1427                 .fflags         = RF_CTRL_INFO | RFTYPE_RES_MB,
1428         },
1429         {
1430                 .name           = "max_threshold_occupancy",
1431                 .mode           = 0644,
1432                 .kf_ops         = &rdtgroup_kf_single_ops,
1433                 .write          = max_threshold_occ_write,
1434                 .seq_show       = max_threshold_occ_show,
1435                 .fflags         = RF_MON_INFO | RFTYPE_RES_CACHE,
1436         },
1437         {
1438                 .name           = "cpus",
1439                 .mode           = 0644,
1440                 .kf_ops         = &rdtgroup_kf_single_ops,
1441                 .write          = rdtgroup_cpus_write,
1442                 .seq_show       = rdtgroup_cpus_show,
1443                 .fflags         = RFTYPE_BASE,
1444         },
1445         {
1446                 .name           = "cpus_list",
1447                 .mode           = 0644,
1448                 .kf_ops         = &rdtgroup_kf_single_ops,
1449                 .write          = rdtgroup_cpus_write,
1450                 .seq_show       = rdtgroup_cpus_show,
1451                 .flags          = RFTYPE_FLAGS_CPUS_LIST,
1452                 .fflags         = RFTYPE_BASE,
1453         },
1454         {
1455                 .name           = "tasks",
1456                 .mode           = 0644,
1457                 .kf_ops         = &rdtgroup_kf_single_ops,
1458                 .write          = rdtgroup_tasks_write,
1459                 .seq_show       = rdtgroup_tasks_show,
1460                 .fflags         = RFTYPE_BASE,
1461         },
1462         {
1463                 .name           = "schemata",
1464                 .mode           = 0644,
1465                 .kf_ops         = &rdtgroup_kf_single_ops,
1466                 .write          = rdtgroup_schemata_write,
1467                 .seq_show       = rdtgroup_schemata_show,
1468                 .fflags         = RF_CTRL_BASE,
1469         },
1470         {
1471                 .name           = "mode",
1472                 .mode           = 0644,
1473                 .kf_ops         = &rdtgroup_kf_single_ops,
1474                 .write          = rdtgroup_mode_write,
1475                 .seq_show       = rdtgroup_mode_show,
1476                 .fflags         = RF_CTRL_BASE,
1477         },
1478         {
1479                 .name           = "size",
1480                 .mode           = 0444,
1481                 .kf_ops         = &rdtgroup_kf_single_ops,
1482                 .seq_show       = rdtgroup_size_show,
1483                 .fflags         = RF_CTRL_BASE,
1484         },
1485
1486 };
1487
1488 static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
1489 {
1490         struct rftype *rfts, *rft;
1491         int ret, len;
1492
1493         rfts = res_common_files;
1494         len = ARRAY_SIZE(res_common_files);
1495
1496         lockdep_assert_held(&rdtgroup_mutex);
1497
1498         for (rft = rfts; rft < rfts + len; rft++) {
1499                 if ((fflags & rft->fflags) == rft->fflags) {
1500                         ret = rdtgroup_add_file(kn, rft);
1501                         if (ret)
1502                                 goto error;
1503                 }
1504         }
1505
1506         return 0;
1507 error:
1508         pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
1509         while (--rft >= rfts) {
1510                 if ((fflags & rft->fflags) == rft->fflags)
1511                         kernfs_remove_by_name(kn, rft->name);
1512         }
1513         return ret;
1514 }
1515
1516 /**
1517  * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
1518  * @r: The resource group with which the file is associated.
1519  * @name: Name of the file
1520  *
1521  * The permissions of named resctrl file, directory, or link are modified
1522  * to not allow read, write, or execute by any user.
1523  *
1524  * WARNING: This function is intended to communicate to the user that the
1525  * resctrl file has been locked down - that it is not relevant to the
1526  * particular state the system finds itself in. It should not be relied
1527  * on to protect from user access because after the file's permissions
1528  * are restricted the user can still change the permissions using chmod
1529  * from the command line.
1530  *
1531  * Return: 0 on success, <0 on failure.
1532  */
1533 int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
1534 {
1535         struct iattr iattr = {.ia_valid = ATTR_MODE,};
1536         struct kernfs_node *kn;
1537         int ret = 0;
1538
1539         kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1540         if (!kn)
1541                 return -ENOENT;
1542
1543         switch (kernfs_type(kn)) {
1544         case KERNFS_DIR:
1545                 iattr.ia_mode = S_IFDIR;
1546                 break;
1547         case KERNFS_FILE:
1548                 iattr.ia_mode = S_IFREG;
1549                 break;
1550         case KERNFS_LINK:
1551                 iattr.ia_mode = S_IFLNK;
1552                 break;
1553         }
1554
1555         ret = kernfs_setattr(kn, &iattr);
1556         kernfs_put(kn);
1557         return ret;
1558 }
1559
1560 /**
1561  * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
1562  * @r: The resource group with which the file is associated.
1563  * @name: Name of the file
1564  * @mask: Mask of permissions that should be restored
1565  *
1566  * Restore the permissions of the named file. If @name is a directory the
1567  * permissions of its parent will be used.
1568  *
1569  * Return: 0 on success, <0 on failure.
1570  */
1571 int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
1572                              umode_t mask)
1573 {
1574         struct iattr iattr = {.ia_valid = ATTR_MODE,};
1575         struct kernfs_node *kn, *parent;
1576         struct rftype *rfts, *rft;
1577         int ret, len;
1578
1579         rfts = res_common_files;
1580         len = ARRAY_SIZE(res_common_files);
1581
1582         for (rft = rfts; rft < rfts + len; rft++) {
1583                 if (!strcmp(rft->name, name))
1584                         iattr.ia_mode = rft->mode & mask;
1585         }
1586
1587         kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1588         if (!kn)
1589                 return -ENOENT;
1590
1591         switch (kernfs_type(kn)) {
1592         case KERNFS_DIR:
1593                 parent = kernfs_get_parent(kn);
1594                 if (parent) {
1595                         iattr.ia_mode |= parent->mode;
1596                         kernfs_put(parent);
1597                 }
1598                 iattr.ia_mode |= S_IFDIR;
1599                 break;
1600         case KERNFS_FILE:
1601                 iattr.ia_mode |= S_IFREG;
1602                 break;
1603         case KERNFS_LINK:
1604                 iattr.ia_mode |= S_IFLNK;
1605                 break;
1606         }
1607
1608         ret = kernfs_setattr(kn, &iattr);
1609         kernfs_put(kn);
1610         return ret;
1611 }
1612
1613 static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
1614                                       unsigned long fflags)
1615 {
1616         struct kernfs_node *kn_subdir;
1617         int ret;
1618
1619         kn_subdir = kernfs_create_dir(kn_info, name,
1620                                       kn_info->mode, r);
1621         if (IS_ERR(kn_subdir))
1622                 return PTR_ERR(kn_subdir);
1623
1624         kernfs_get(kn_subdir);
1625         ret = rdtgroup_kn_set_ugid(kn_subdir);
1626         if (ret)
1627                 return ret;
1628
1629         ret = rdtgroup_add_files(kn_subdir, fflags);
1630         if (!ret)
1631                 kernfs_activate(kn_subdir);
1632
1633         return ret;
1634 }
1635
1636 static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
1637 {
1638         struct rdt_resource *r;
1639         unsigned long fflags;
1640         char name[32];
1641         int ret;
1642
1643         /* create the directory */
1644         kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
1645         if (IS_ERR(kn_info))
1646                 return PTR_ERR(kn_info);
1647         kernfs_get(kn_info);
1648
1649         ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
1650         if (ret)
1651                 goto out_destroy;
1652
1653         for_each_alloc_enabled_rdt_resource(r) {
1654                 fflags =  r->fflags | RF_CTRL_INFO;
1655                 ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
1656                 if (ret)
1657                         goto out_destroy;
1658         }
1659
1660         for_each_mon_enabled_rdt_resource(r) {
1661                 fflags =  r->fflags | RF_MON_INFO;
1662                 sprintf(name, "%s_MON", r->name);
1663                 ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
1664                 if (ret)
1665                         goto out_destroy;
1666         }
1667
1668         /*
1669          * This extra ref will be put in kernfs_remove() and guarantees
1670          * that @rdtgrp->kn is always accessible.
1671          */
1672         kernfs_get(kn_info);
1673
1674         ret = rdtgroup_kn_set_ugid(kn_info);
1675         if (ret)
1676                 goto out_destroy;
1677
1678         kernfs_activate(kn_info);
1679
1680         return 0;
1681
1682 out_destroy:
1683         kernfs_remove(kn_info);
1684         return ret;
1685 }
1686
1687 static int
1688 mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
1689                     char *name, struct kernfs_node **dest_kn)
1690 {
1691         struct kernfs_node *kn;
1692         int ret;
1693
1694         /* create the directory */
1695         kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
1696         if (IS_ERR(kn))
1697                 return PTR_ERR(kn);
1698
1699         if (dest_kn)
1700                 *dest_kn = kn;
1701
1702         /*
1703          * This extra ref will be put in kernfs_remove() and guarantees
1704          * that @rdtgrp->kn is always accessible.
1705          */
1706         kernfs_get(kn);
1707
1708         ret = rdtgroup_kn_set_ugid(kn);
1709         if (ret)
1710                 goto out_destroy;
1711
1712         kernfs_activate(kn);
1713
1714         return 0;
1715
1716 out_destroy:
1717         kernfs_remove(kn);
1718         return ret;
1719 }
1720
1721 static void l3_qos_cfg_update(void *arg)
1722 {
1723         bool *enable = arg;
1724
1725         wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
1726 }
1727
1728 static void l2_qos_cfg_update(void *arg)
1729 {
1730         bool *enable = arg;
1731
1732         wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
1733 }
1734
1735 static inline bool is_mba_linear(void)
1736 {
1737         return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
1738 }
1739
1740 static int set_cache_qos_cfg(int level, bool enable)
1741 {
1742         void (*update)(void *arg);
1743         struct rdt_resource *r_l;
1744         cpumask_var_t cpu_mask;
1745         struct rdt_domain *d;
1746         int cpu;
1747
1748         if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
1749                 return -ENOMEM;
1750
1751         if (level == RDT_RESOURCE_L3)
1752                 update = l3_qos_cfg_update;
1753         else if (level == RDT_RESOURCE_L2)
1754                 update = l2_qos_cfg_update;
1755         else
1756                 return -EINVAL;
1757
1758         r_l = &rdt_resources_all[level];
1759         list_for_each_entry(d, &r_l->domains, list) {
1760                 /* Pick one CPU from each domain instance to update MSR */
1761                 cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
1762         }
1763         cpu = get_cpu();
1764         /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
1765         if (cpumask_test_cpu(cpu, cpu_mask))
1766                 update(&enable);
1767         /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
1768         smp_call_function_many(cpu_mask, update, &enable, 1);
1769         put_cpu();
1770
1771         free_cpumask_var(cpu_mask);
1772
1773         return 0;
1774 }
1775
1776 /*
1777  * Enable or disable the MBA software controller
1778  * which helps user specify bandwidth in MBps.
1779  * MBA software controller is supported only if
1780  * MBM is supported and MBA is in linear scale.
1781  */
1782 static int set_mba_sc(bool mba_sc)
1783 {
1784         struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
1785         struct rdt_domain *d;
1786
1787         if (!is_mbm_enabled() || !is_mba_linear() ||
1788             mba_sc == is_mba_sc(r))
1789                 return -EINVAL;
1790
1791         r->membw.mba_sc = mba_sc;
1792         list_for_each_entry(d, &r->domains, list)
1793                 setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
1794
1795         return 0;
1796 }
1797
1798 static int cdp_enable(int level, int data_type, int code_type)
1799 {
1800         struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
1801         struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
1802         struct rdt_resource *r_l = &rdt_resources_all[level];
1803         int ret;
1804
1805         if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
1806             !r_lcode->alloc_capable)
1807                 return -EINVAL;
1808
1809         ret = set_cache_qos_cfg(level, true);
1810         if (!ret) {
1811                 r_l->alloc_enabled = false;
1812                 r_ldata->alloc_enabled = true;
1813                 r_lcode->alloc_enabled = true;
1814         }
1815         return ret;
1816 }
1817
1818 static int cdpl3_enable(void)
1819 {
1820         return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
1821                           RDT_RESOURCE_L3CODE);
1822 }
1823
1824 static int cdpl2_enable(void)
1825 {
1826         return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
1827                           RDT_RESOURCE_L2CODE);
1828 }
1829
1830 static void cdp_disable(int level, int data_type, int code_type)
1831 {
1832         struct rdt_resource *r = &rdt_resources_all[level];
1833
1834         r->alloc_enabled = r->alloc_capable;
1835
1836         if (rdt_resources_all[data_type].alloc_enabled) {
1837                 rdt_resources_all[data_type].alloc_enabled = false;
1838                 rdt_resources_all[code_type].alloc_enabled = false;
1839                 set_cache_qos_cfg(level, false);
1840         }
1841 }
1842
1843 static void cdpl3_disable(void)
1844 {
1845         cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
1846 }
1847
1848 static void cdpl2_disable(void)
1849 {
1850         cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
1851 }
1852
1853 static void cdp_disable_all(void)
1854 {
1855         if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
1856                 cdpl3_disable();
1857         if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
1858                 cdpl2_disable();
1859 }
1860
1861 static int parse_rdtgroupfs_options(char *data)
1862 {
1863         char *token, *o = data;
1864         int ret = 0;
1865
1866         while ((token = strsep(&o, ",")) != NULL) {
1867                 if (!*token) {
1868                         ret = -EINVAL;
1869                         goto out;
1870                 }
1871
1872                 if (!strcmp(token, "cdp")) {
1873                         ret = cdpl3_enable();
1874                         if (ret)
1875                                 goto out;
1876                 } else if (!strcmp(token, "cdpl2")) {
1877                         ret = cdpl2_enable();
1878                         if (ret)
1879                                 goto out;
1880                 } else if (!strcmp(token, "mba_MBps")) {
1881                         ret = set_mba_sc(true);
1882                         if (ret)
1883                                 goto out;
1884                 } else {
1885                         ret = -EINVAL;
1886                         goto out;
1887                 }
1888         }
1889
1890         return 0;
1891
1892 out:
1893         pr_err("Invalid mount option \"%s\"\n", token);
1894
1895         return ret;
1896 }
1897
1898 /*
1899  * We don't allow rdtgroup directories to be created anywhere
1900  * except the root directory. Thus when looking for the rdtgroup
1901  * structure for a kernfs node we are either looking at a directory,
1902  * in which case the rdtgroup structure is pointed at by the "priv"
1903  * field, otherwise we have a file, and need only look to the parent
1904  * to find the rdtgroup.
1905  */
1906 static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
1907 {
1908         if (kernfs_type(kn) == KERNFS_DIR) {
1909                 /*
1910                  * All the resource directories use "kn->priv"
1911                  * to point to the "struct rdtgroup" for the
1912                  * resource. "info" and its subdirectories don't
1913                  * have rdtgroup structures, so return NULL here.
1914                  */
1915                 if (kn == kn_info || kn->parent == kn_info)
1916                         return NULL;
1917                 else
1918                         return kn->priv;
1919         } else {
1920                 return kn->parent->priv;
1921         }
1922 }
1923
1924 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
1925 {
1926         struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
1927
1928         if (!rdtgrp)
1929                 return NULL;
1930
1931         atomic_inc(&rdtgrp->waitcount);
1932         kernfs_break_active_protection(kn);
1933
1934         mutex_lock(&rdtgroup_mutex);
1935
1936         /* Was this group deleted while we waited? */
1937         if (rdtgrp->flags & RDT_DELETED)
1938                 return NULL;
1939
1940         return rdtgrp;
1941 }
1942
1943 void rdtgroup_kn_unlock(struct kernfs_node *kn)
1944 {
1945         struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
1946
1947         if (!rdtgrp)
1948                 return;
1949
1950         mutex_unlock(&rdtgroup_mutex);
1951
1952         if (atomic_dec_and_test(&rdtgrp->waitcount) &&
1953             (rdtgrp->flags & RDT_DELETED)) {
1954                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
1955                     rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
1956                         rdtgroup_pseudo_lock_remove(rdtgrp);
1957                 kernfs_unbreak_active_protection(kn);
1958                 kernfs_put(rdtgrp->kn);
1959                 kfree(rdtgrp);
1960         } else {
1961                 kernfs_unbreak_active_protection(kn);
1962         }
1963 }
1964
1965 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
1966                              struct rdtgroup *prgrp,
1967                              struct kernfs_node **mon_data_kn);
1968
1969 static struct dentry *rdt_mount(struct file_system_type *fs_type,
1970                                 int flags, const char *unused_dev_name,
1971                                 void *data)
1972 {
1973         struct rdt_domain *dom;
1974         struct rdt_resource *r;
1975         struct dentry *dentry;
1976         int ret;
1977
1978         cpus_read_lock();
1979         mutex_lock(&rdtgroup_mutex);
1980         /*
1981          * resctrl file system can only be mounted once.
1982          */
1983         if (static_branch_unlikely(&rdt_enable_key)) {
1984                 dentry = ERR_PTR(-EBUSY);
1985                 goto out;
1986         }
1987
1988         ret = parse_rdtgroupfs_options(data);
1989         if (ret) {
1990                 dentry = ERR_PTR(ret);
1991                 goto out_cdp;
1992         }
1993
1994         closid_init();
1995
1996         ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
1997         if (ret) {
1998                 dentry = ERR_PTR(ret);
1999                 goto out_cdp;
2000         }
2001
2002         if (rdt_mon_capable) {
2003                 ret = mongroup_create_dir(rdtgroup_default.kn,
2004                                           NULL, "mon_groups",
2005                                           &kn_mongrp);
2006                 if (ret) {
2007                         dentry = ERR_PTR(ret);
2008                         goto out_info;
2009                 }
2010                 kernfs_get(kn_mongrp);
2011
2012                 ret = mkdir_mondata_all(rdtgroup_default.kn,
2013                                         &rdtgroup_default, &kn_mondata);
2014                 if (ret) {
2015                         dentry = ERR_PTR(ret);
2016                         goto out_mongrp;
2017                 }
2018                 kernfs_get(kn_mondata);
2019                 rdtgroup_default.mon.mon_data_kn = kn_mondata;
2020         }
2021
2022         ret = rdt_pseudo_lock_init();
2023         if (ret) {
2024                 dentry = ERR_PTR(ret);
2025                 goto out_mondata;
2026         }
2027
2028         dentry = kernfs_mount(fs_type, flags, rdt_root,
2029                               RDTGROUP_SUPER_MAGIC, NULL);
2030         if (IS_ERR(dentry))
2031                 goto out_psl;
2032
2033         if (rdt_alloc_capable)
2034                 static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
2035         if (rdt_mon_capable)
2036                 static_branch_enable_cpuslocked(&rdt_mon_enable_key);
2037
2038         if (rdt_alloc_capable || rdt_mon_capable)
2039                 static_branch_enable_cpuslocked(&rdt_enable_key);
2040
2041         if (is_mbm_enabled()) {
2042                 r = &rdt_resources_all[RDT_RESOURCE_L3];
2043                 list_for_each_entry(dom, &r->domains, list)
2044                         mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
2045         }
2046
2047         goto out;
2048
2049 out_psl:
2050         rdt_pseudo_lock_release();
2051 out_mondata:
2052         if (rdt_mon_capable)
2053                 kernfs_remove(kn_mondata);
2054 out_mongrp:
2055         if (rdt_mon_capable)
2056                 kernfs_remove(kn_mongrp);
2057 out_info:
2058         kernfs_remove(kn_info);
2059 out_cdp:
2060         cdp_disable_all();
2061 out:
2062         rdt_last_cmd_clear();
2063         mutex_unlock(&rdtgroup_mutex);
2064         cpus_read_unlock();
2065
2066         return dentry;
2067 }
2068
2069 static int reset_all_ctrls(struct rdt_resource *r)
2070 {
2071         struct msr_param msr_param;
2072         cpumask_var_t cpu_mask;
2073         struct rdt_domain *d;
2074         int i, cpu;
2075
2076         if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
2077                 return -ENOMEM;
2078
2079         msr_param.res = r;
2080         msr_param.low = 0;
2081         msr_param.high = r->num_closid;
2082
2083         /*
2084          * Disable resource control for this resource by setting all
2085          * CBMs in all domains to the maximum mask value. Pick one CPU
2086          * from each domain to update the MSRs below.
2087          */
2088         list_for_each_entry(d, &r->domains, list) {
2089                 cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
2090
2091                 for (i = 0; i < r->num_closid; i++)
2092                         d->ctrl_val[i] = r->default_ctrl;
2093         }
2094         cpu = get_cpu();
2095         /* Update CBM on this cpu if it's in cpu_mask. */
2096         if (cpumask_test_cpu(cpu, cpu_mask))
2097                 rdt_ctrl_update(&msr_param);
2098         /* Update CBM on all other cpus in cpu_mask. */
2099         smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
2100         put_cpu();
2101
2102         free_cpumask_var(cpu_mask);
2103
2104         return 0;
2105 }
2106
2107 static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
2108 {
2109         return (rdt_alloc_capable &&
2110                 (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
2111 }
2112
2113 static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
2114 {
2115         return (rdt_mon_capable &&
2116                 (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
2117 }
2118
2119 /*
2120  * Move tasks from one to the other group. If @from is NULL, then all tasks
2121  * in the systems are moved unconditionally (used for teardown).
2122  *
2123  * If @mask is not NULL the cpus on which moved tasks are running are set
2124  * in that mask so the update smp function call is restricted to affected
2125  * cpus.
2126  */
2127 static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2128                                  struct cpumask *mask)
2129 {
2130         struct task_struct *p, *t;
2131
2132         read_lock(&tasklist_lock);
2133         for_each_process_thread(p, t) {
2134                 if (!from || is_closid_match(t, from) ||
2135                     is_rmid_match(t, from)) {
2136                         t->closid = to->closid;
2137                         t->rmid = to->mon.rmid;
2138
2139 #ifdef CONFIG_SMP
2140                         /*
2141                          * This is safe on x86 w/o barriers as the ordering
2142                          * of writing to task_cpu() and t->on_cpu is
2143                          * reverse to the reading here. The detection is
2144                          * inaccurate as tasks might move or schedule
2145                          * before the smp function call takes place. In
2146                          * such a case the function call is pointless, but
2147                          * there is no other side effect.
2148                          */
2149                         if (mask && t->on_cpu)
2150                                 cpumask_set_cpu(task_cpu(t), mask);
2151 #endif
2152                 }
2153         }
2154         read_unlock(&tasklist_lock);
2155 }
2156
2157 static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
2158 {
2159         struct rdtgroup *sentry, *stmp;
2160         struct list_head *head;
2161
2162         head = &rdtgrp->mon.crdtgrp_list;
2163         list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
2164                 free_rmid(sentry->mon.rmid);
2165                 list_del(&sentry->mon.crdtgrp_list);
2166                 kfree(sentry);
2167         }
2168 }
2169
2170 /*
2171  * Forcibly remove all of subdirectories under root.
2172  */
2173 static void rmdir_all_sub(void)
2174 {
2175         struct rdtgroup *rdtgrp, *tmp;
2176
2177         /* Move all tasks to the default resource group */
2178         rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
2179
2180         list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
2181                 /* Free any child rmids */
2182                 free_all_child_rdtgrp(rdtgrp);
2183
2184                 /* Remove each rdtgroup other than root */
2185                 if (rdtgrp == &rdtgroup_default)
2186                         continue;
2187
2188                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2189                     rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2190                         rdtgroup_pseudo_lock_remove(rdtgrp);
2191
2192                 /*
2193                  * Give any CPUs back to the default group. We cannot copy
2194                  * cpu_online_mask because a CPU might have executed the
2195                  * offline callback already, but is still marked online.
2196                  */
2197                 cpumask_or(&rdtgroup_default.cpu_mask,
2198                            &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2199
2200                 free_rmid(rdtgrp->mon.rmid);
2201
2202                 kernfs_remove(rdtgrp->kn);
2203                 list_del(&rdtgrp->rdtgroup_list);
2204                 kfree(rdtgrp);
2205         }
2206         /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
2207         update_closid_rmid(cpu_online_mask, &rdtgroup_default);
2208
2209         kernfs_remove(kn_info);
2210         kernfs_remove(kn_mongrp);
2211         kernfs_remove(kn_mondata);
2212 }
2213
2214 static void rdt_kill_sb(struct super_block *sb)
2215 {
2216         struct rdt_resource *r;
2217
2218         cpus_read_lock();
2219         mutex_lock(&rdtgroup_mutex);
2220
2221         set_mba_sc(false);
2222
2223         /*Put everything back to default values. */
2224         for_each_alloc_enabled_rdt_resource(r)
2225                 reset_all_ctrls(r);
2226         cdp_disable_all();
2227         rmdir_all_sub();
2228         rdt_pseudo_lock_release();
2229         rdtgroup_default.mode = RDT_MODE_SHAREABLE;
2230         static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
2231         static_branch_disable_cpuslocked(&rdt_mon_enable_key);
2232         static_branch_disable_cpuslocked(&rdt_enable_key);
2233         kernfs_kill_sb(sb);
2234         mutex_unlock(&rdtgroup_mutex);
2235         cpus_read_unlock();
2236 }
2237
2238 static struct file_system_type rdt_fs_type = {
2239         .name    = "resctrl",
2240         .mount   = rdt_mount,
2241         .kill_sb = rdt_kill_sb,
2242 };
2243
2244 static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
2245                        void *priv)
2246 {
2247         struct kernfs_node *kn;
2248         int ret = 0;
2249
2250         kn = __kernfs_create_file(parent_kn, name, 0444,
2251                                   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
2252                                   &kf_mondata_ops, priv, NULL, NULL);
2253         if (IS_ERR(kn))
2254                 return PTR_ERR(kn);
2255
2256         ret = rdtgroup_kn_set_ugid(kn);
2257         if (ret) {
2258                 kernfs_remove(kn);
2259                 return ret;
2260         }
2261
2262         return ret;
2263 }
2264
2265 /*
2266  * Remove all subdirectories of mon_data of ctrl_mon groups
2267  * and monitor groups with given domain id.
2268  */
2269 void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
2270 {
2271         struct rdtgroup *prgrp, *crgrp;
2272         char name[32];
2273
2274         if (!r->mon_enabled)
2275                 return;
2276
2277         list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2278                 sprintf(name, "mon_%s_%02d", r->name, dom_id);
2279                 kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
2280
2281                 list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
2282                         kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
2283         }
2284 }
2285
2286 static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
2287                                 struct rdt_domain *d,
2288                                 struct rdt_resource *r, struct rdtgroup *prgrp)
2289 {
2290         union mon_data_bits priv;
2291         struct kernfs_node *kn;
2292         struct mon_evt *mevt;
2293         struct rmid_read rr;
2294         char name[32];
2295         int ret;
2296
2297         sprintf(name, "mon_%s_%02d", r->name, d->id);
2298         /* create the directory */
2299         kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2300         if (IS_ERR(kn))
2301                 return PTR_ERR(kn);
2302
2303         /*
2304          * This extra ref will be put in kernfs_remove() and guarantees
2305          * that kn is always accessible.
2306          */
2307         kernfs_get(kn);
2308         ret = rdtgroup_kn_set_ugid(kn);
2309         if (ret)
2310                 goto out_destroy;
2311
2312         if (WARN_ON(list_empty(&r->evt_list))) {
2313                 ret = -EPERM;
2314                 goto out_destroy;
2315         }
2316
2317         priv.u.rid = r->rid;
2318         priv.u.domid = d->id;
2319         list_for_each_entry(mevt, &r->evt_list, list) {
2320                 priv.u.evtid = mevt->evtid;
2321                 ret = mon_addfile(kn, mevt->name, priv.priv);
2322                 if (ret)
2323                         goto out_destroy;
2324
2325                 if (is_mbm_event(mevt->evtid))
2326                         mon_event_read(&rr, d, prgrp, mevt->evtid, true);
2327         }
2328         kernfs_activate(kn);
2329         return 0;
2330
2331 out_destroy:
2332         kernfs_remove(kn);
2333         return ret;
2334 }
2335
2336 /*
2337  * Add all subdirectories of mon_data for "ctrl_mon" groups
2338  * and "monitor" groups with given domain id.
2339  */
2340 void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
2341                                     struct rdt_domain *d)
2342 {
2343         struct kernfs_node *parent_kn;
2344         struct rdtgroup *prgrp, *crgrp;
2345         struct list_head *head;
2346
2347         if (!r->mon_enabled)
2348                 return;
2349
2350         list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2351                 parent_kn = prgrp->mon.mon_data_kn;
2352                 mkdir_mondata_subdir(parent_kn, d, r, prgrp);
2353
2354                 head = &prgrp->mon.crdtgrp_list;
2355                 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
2356                         parent_kn = crgrp->mon.mon_data_kn;
2357                         mkdir_mondata_subdir(parent_kn, d, r, crgrp);
2358                 }
2359         }
2360 }
2361
2362 static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
2363                                        struct rdt_resource *r,
2364                                        struct rdtgroup *prgrp)
2365 {
2366         struct rdt_domain *dom;
2367         int ret;
2368
2369         list_for_each_entry(dom, &r->domains, list) {
2370                 ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
2371                 if (ret)
2372                         return ret;
2373         }
2374
2375         return 0;
2376 }
2377
2378 /*
2379  * This creates a directory mon_data which contains the monitored data.
2380  *
2381  * mon_data has one directory for each domain whic are named
2382  * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
2383  * with L3 domain looks as below:
2384  * ./mon_data:
2385  * mon_L3_00
2386  * mon_L3_01
2387  * mon_L3_02
2388  * ...
2389  *
2390  * Each domain directory has one file per event:
2391  * ./mon_L3_00/:
2392  * llc_occupancy
2393  *
2394  */
2395 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2396                              struct rdtgroup *prgrp,
2397                              struct kernfs_node **dest_kn)
2398 {
2399         struct rdt_resource *r;
2400         struct kernfs_node *kn;
2401         int ret;
2402
2403         /*
2404          * Create the mon_data directory first.
2405          */
2406         ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
2407         if (ret)
2408                 return ret;
2409
2410         if (dest_kn)
2411                 *dest_kn = kn;
2412
2413         /*
2414          * Create the subdirectories for each domain. Note that all events
2415          * in a domain like L3 are grouped into a resource whose domain is L3
2416          */
2417         for_each_mon_enabled_rdt_resource(r) {
2418                 ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
2419                 if (ret)
2420                         goto out_destroy;
2421         }
2422
2423         return 0;
2424
2425 out_destroy:
2426         kernfs_remove(kn);
2427         return ret;
2428 }
2429
2430 /**
2431  * cbm_ensure_valid - Enforce validity on provided CBM
2432  * @_val:       Candidate CBM
2433  * @r:          RDT resource to which the CBM belongs
2434  *
2435  * The provided CBM represents all cache portions available for use. This
2436  * may be represented by a bitmap that does not consist of contiguous ones
2437  * and thus be an invalid CBM.
2438  * Here the provided CBM is forced to be a valid CBM by only considering
2439  * the first set of contiguous bits as valid and clearing all bits.
2440  * The intention here is to provide a valid default CBM with which a new
2441  * resource group is initialized. The user can follow this with a
2442  * modification to the CBM if the default does not satisfy the
2443  * requirements.
2444  */
2445 static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
2446 {
2447         /*
2448          * Convert the u32 _val to an unsigned long required by all the bit
2449          * operations within this function. No more than 32 bits of this
2450          * converted value can be accessed because all bit operations are
2451          * additionally provided with cbm_len that is initialized during
2452          * hardware enumeration using five bits from the EAX register and
2453          * thus never can exceed 32 bits.
2454          */
2455         unsigned long *val = (unsigned long *)_val;
2456         unsigned int cbm_len = r->cache.cbm_len;
2457         unsigned long first_bit, zero_bit;
2458
2459         if (*val == 0)
2460                 return;
2461
2462         first_bit = find_first_bit(val, cbm_len);
2463         zero_bit = find_next_zero_bit(val, cbm_len, first_bit);
2464
2465         /* Clear any remaining bits to ensure contiguous region */
2466         bitmap_clear(val, zero_bit, cbm_len - zero_bit);
2467 }
2468
2469 /**
2470  * rdtgroup_init_alloc - Initialize the new RDT group's allocations
2471  *
2472  * A new RDT group is being created on an allocation capable (CAT)
2473  * supporting system. Set this group up to start off with all usable
2474  * allocations. That is, all shareable and unused bits.
2475  *
2476  * All-zero CBM is invalid. If there are no more shareable bits available
2477  * on any domain then the entire allocation will fail.
2478  */
2479 static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
2480 {
2481         struct rdt_resource *r_cdp = NULL;
2482         struct rdt_domain *d_cdp = NULL;
2483         u32 used_b = 0, unused_b = 0;
2484         u32 closid = rdtgrp->closid;
2485         struct rdt_resource *r;
2486         unsigned long tmp_cbm;
2487         enum rdtgrp_mode mode;
2488         struct rdt_domain *d;
2489         u32 peer_ctl, *ctrl;
2490         int i, ret;
2491
2492         for_each_alloc_enabled_rdt_resource(r) {
2493                 /*
2494                  * Only initialize default allocations for CBM cache
2495                  * resources
2496                  */
2497                 if (r->rid == RDT_RESOURCE_MBA)
2498                         continue;
2499                 list_for_each_entry(d, &r->domains, list) {
2500                         rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp);
2501                         d->have_new_ctrl = false;
2502                         d->new_ctrl = r->cache.shareable_bits;
2503                         used_b = r->cache.shareable_bits;
2504                         ctrl = d->ctrl_val;
2505                         for (i = 0; i < closids_supported(); i++, ctrl++) {
2506                                 if (closid_allocated(i) && i != closid) {
2507                                         mode = rdtgroup_mode_by_closid(i);
2508                                         if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
2509                                                 break;
2510                                         /*
2511                                          * If CDP is active include peer
2512                                          * domain's usage to ensure there
2513                                          * is no overlap with an exclusive
2514                                          * group.
2515                                          */
2516                                         if (d_cdp)
2517                                                 peer_ctl = d_cdp->ctrl_val[i];
2518                                         else
2519                                                 peer_ctl = 0;
2520                                         used_b |= *ctrl | peer_ctl;
2521                                         if (mode == RDT_MODE_SHAREABLE)
2522                                                 d->new_ctrl |= *ctrl | peer_ctl;
2523                                 }
2524                         }
2525                         if (d->plr && d->plr->cbm > 0)
2526                                 used_b |= d->plr->cbm;
2527                         unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
2528                         unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
2529                         d->new_ctrl |= unused_b;
2530                         /*
2531                          * Force the initial CBM to be valid, user can
2532                          * modify the CBM based on system availability.
2533                          */
2534                         cbm_ensure_valid(&d->new_ctrl, r);
2535                         /*
2536                          * Assign the u32 CBM to an unsigned long to ensure
2537                          * that bitmap_weight() does not access out-of-bound
2538                          * memory.
2539                          */
2540                         tmp_cbm = d->new_ctrl;
2541                         if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) <
2542                             r->cache.min_cbm_bits) {
2543                                 rdt_last_cmd_printf("no space on %s:%d\n",
2544                                                     r->name, d->id);
2545                                 return -ENOSPC;
2546                         }
2547                         d->have_new_ctrl = true;
2548                 }
2549         }
2550
2551         for_each_alloc_enabled_rdt_resource(r) {
2552                 /*
2553                  * Only initialize default allocations for CBM cache
2554                  * resources
2555                  */
2556                 if (r->rid == RDT_RESOURCE_MBA)
2557                         continue;
2558                 ret = update_domains(r, rdtgrp->closid);
2559                 if (ret < 0) {
2560                         rdt_last_cmd_puts("failed to initialize allocations\n");
2561                         return ret;
2562                 }
2563                 rdtgrp->mode = RDT_MODE_SHAREABLE;
2564         }
2565
2566         return 0;
2567 }
2568
2569 static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
2570                              struct kernfs_node *prgrp_kn,
2571                              const char *name, umode_t mode,
2572                              enum rdt_group_type rtype, struct rdtgroup **r)
2573 {
2574         struct rdtgroup *prdtgrp, *rdtgrp;
2575         struct kernfs_node *kn;
2576         uint files = 0;
2577         int ret;
2578
2579         prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
2580         rdt_last_cmd_clear();
2581         if (!prdtgrp) {
2582                 ret = -ENODEV;
2583                 rdt_last_cmd_puts("directory was removed\n");
2584                 goto out_unlock;
2585         }
2586
2587         if (rtype == RDTMON_GROUP &&
2588             (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2589              prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
2590                 ret = -EINVAL;
2591                 rdt_last_cmd_puts("pseudo-locking in progress\n");
2592                 goto out_unlock;
2593         }
2594
2595         /* allocate the rdtgroup. */
2596         rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
2597         if (!rdtgrp) {
2598                 ret = -ENOSPC;
2599                 rdt_last_cmd_puts("kernel out of memory\n");
2600                 goto out_unlock;
2601         }
2602         *r = rdtgrp;
2603         rdtgrp->mon.parent = prdtgrp;
2604         rdtgrp->type = rtype;
2605         INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
2606
2607         /* kernfs creates the directory for rdtgrp */
2608         kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
2609         if (IS_ERR(kn)) {
2610                 ret = PTR_ERR(kn);
2611                 rdt_last_cmd_puts("kernfs create error\n");
2612                 goto out_free_rgrp;
2613         }
2614         rdtgrp->kn = kn;
2615
2616         /*
2617          * kernfs_remove() will drop the reference count on "kn" which
2618          * will free it. But we still need it to stick around for the
2619          * rdtgroup_kn_unlock(kn} call below. Take one extra reference
2620          * here, which will be dropped inside rdtgroup_kn_unlock().
2621          */
2622         kernfs_get(kn);
2623
2624         ret = rdtgroup_kn_set_ugid(kn);
2625         if (ret) {
2626                 rdt_last_cmd_puts("kernfs perm error\n");
2627                 goto out_destroy;
2628         }
2629
2630         files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
2631         ret = rdtgroup_add_files(kn, files);
2632         if (ret) {
2633                 rdt_last_cmd_puts("kernfs fill error\n");
2634                 goto out_destroy;
2635         }
2636
2637         if (rdt_mon_capable) {
2638                 ret = alloc_rmid();
2639                 if (ret < 0) {
2640                         rdt_last_cmd_puts("out of RMIDs\n");
2641                         goto out_destroy;
2642                 }
2643                 rdtgrp->mon.rmid = ret;
2644
2645                 ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
2646                 if (ret) {
2647                         rdt_last_cmd_puts("kernfs subdir error\n");
2648                         goto out_idfree;
2649                 }
2650         }
2651         kernfs_activate(kn);
2652
2653         /*
2654          * The caller unlocks the prgrp_kn upon success.
2655          */
2656         return 0;
2657
2658 out_idfree:
2659         free_rmid(rdtgrp->mon.rmid);
2660 out_destroy:
2661         kernfs_remove(rdtgrp->kn);
2662 out_free_rgrp:
2663         kfree(rdtgrp);
2664 out_unlock:
2665         rdtgroup_kn_unlock(prgrp_kn);
2666         return ret;
2667 }
2668
2669 static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
2670 {
2671         kernfs_remove(rgrp->kn);
2672         free_rmid(rgrp->mon.rmid);
2673         kfree(rgrp);
2674 }
2675
2676 /*
2677  * Create a monitor group under "mon_groups" directory of a control
2678  * and monitor group(ctrl_mon). This is a resource group
2679  * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
2680  */
2681 static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
2682                               struct kernfs_node *prgrp_kn,
2683                               const char *name,
2684                               umode_t mode)
2685 {
2686         struct rdtgroup *rdtgrp, *prgrp;
2687         int ret;
2688
2689         ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
2690                                 &rdtgrp);
2691         if (ret)
2692                 return ret;
2693
2694         prgrp = rdtgrp->mon.parent;
2695         rdtgrp->closid = prgrp->closid;
2696
2697         /*
2698          * Add the rdtgrp to the list of rdtgrps the parent
2699          * ctrl_mon group has to track.
2700          */
2701         list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
2702
2703         rdtgroup_kn_unlock(prgrp_kn);
2704         return ret;
2705 }
2706
2707 /*
2708  * These are rdtgroups created under the root directory. Can be used
2709  * to allocate and monitor resources.
2710  */
2711 static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
2712                                    struct kernfs_node *prgrp_kn,
2713                                    const char *name, umode_t mode)
2714 {
2715         struct rdtgroup *rdtgrp;
2716         struct kernfs_node *kn;
2717         u32 closid;
2718         int ret;
2719
2720         ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
2721                                 &rdtgrp);
2722         if (ret)
2723                 return ret;
2724
2725         kn = rdtgrp->kn;
2726         ret = closid_alloc();
2727         if (ret < 0) {
2728                 rdt_last_cmd_puts("out of CLOSIDs\n");
2729                 goto out_common_fail;
2730         }
2731         closid = ret;
2732         ret = 0;
2733
2734         rdtgrp->closid = closid;
2735         ret = rdtgroup_init_alloc(rdtgrp);
2736         if (ret < 0)
2737                 goto out_id_free;
2738
2739         list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
2740
2741         if (rdt_mon_capable) {
2742                 /*
2743                  * Create an empty mon_groups directory to hold the subset
2744                  * of tasks and cpus to monitor.
2745                  */
2746                 ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
2747                 if (ret) {
2748                         rdt_last_cmd_puts("kernfs subdir error\n");
2749                         goto out_del_list;
2750                 }
2751         }
2752
2753         goto out_unlock;
2754
2755 out_del_list:
2756         list_del(&rdtgrp->rdtgroup_list);
2757 out_id_free:
2758         closid_free(closid);
2759 out_common_fail:
2760         mkdir_rdt_prepare_clean(rdtgrp);
2761 out_unlock:
2762         rdtgroup_kn_unlock(prgrp_kn);
2763         return ret;
2764 }
2765
2766 /*
2767  * We allow creating mon groups only with in a directory called "mon_groups"
2768  * which is present in every ctrl_mon group. Check if this is a valid
2769  * "mon_groups" directory.
2770  *
2771  * 1. The directory should be named "mon_groups".
2772  * 2. The mon group itself should "not" be named "mon_groups".
2773  *   This makes sure "mon_groups" directory always has a ctrl_mon group
2774  *   as parent.
2775  */
2776 static bool is_mon_groups(struct kernfs_node *kn, const char *name)
2777 {
2778         return (!strcmp(kn->name, "mon_groups") &&
2779                 strcmp(name, "mon_groups"));
2780 }
2781
2782 static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
2783                           umode_t mode)
2784 {
2785         /* Do not accept '\n' to avoid unparsable situation. */
2786         if (strchr(name, '\n'))
2787                 return -EINVAL;
2788
2789         /*
2790          * If the parent directory is the root directory and RDT
2791          * allocation is supported, add a control and monitoring
2792          * subdirectory
2793          */
2794         if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
2795                 return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
2796
2797         /*
2798          * If RDT monitoring is supported and the parent directory is a valid
2799          * "mon_groups" directory, add a monitoring subdirectory.
2800          */
2801         if (rdt_mon_capable && is_mon_groups(parent_kn, name))
2802                 return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
2803
2804         return -EPERM;
2805 }
2806
2807 static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
2808                               cpumask_var_t tmpmask)
2809 {
2810         struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
2811         int cpu;
2812
2813         /* Give any tasks back to the parent group */
2814         rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
2815
2816         /* Update per cpu rmid of the moved CPUs first */
2817         for_each_cpu(cpu, &rdtgrp->cpu_mask)
2818                 per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
2819         /*
2820          * Update the MSR on moved CPUs and CPUs which have moved
2821          * task running on them.
2822          */
2823         cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
2824         update_closid_rmid(tmpmask, NULL);
2825
2826         rdtgrp->flags = RDT_DELETED;
2827         free_rmid(rdtgrp->mon.rmid);
2828
2829         /*
2830          * Remove the rdtgrp from the parent ctrl_mon group's list
2831          */
2832         WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
2833         list_del(&rdtgrp->mon.crdtgrp_list);
2834
2835         /*
2836          * one extra hold on this, will drop when we kfree(rdtgrp)
2837          * in rdtgroup_kn_unlock()
2838          */
2839         kernfs_get(kn);
2840         kernfs_remove(rdtgrp->kn);
2841
2842         return 0;
2843 }
2844
2845 static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
2846                                 struct rdtgroup *rdtgrp)
2847 {
2848         rdtgrp->flags = RDT_DELETED;
2849         list_del(&rdtgrp->rdtgroup_list);
2850
2851         /*
2852          * one extra hold on this, will drop when we kfree(rdtgrp)
2853          * in rdtgroup_kn_unlock()
2854          */
2855         kernfs_get(kn);
2856         kernfs_remove(rdtgrp->kn);
2857         return 0;
2858 }
2859
2860 static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
2861                                cpumask_var_t tmpmask)
2862 {
2863         int cpu;
2864
2865         /* Give any tasks back to the default group */
2866         rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
2867
2868         /* Give any CPUs back to the default group */
2869         cpumask_or(&rdtgroup_default.cpu_mask,
2870                    &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2871
2872         /* Update per cpu closid and rmid of the moved CPUs first */
2873         for_each_cpu(cpu, &rdtgrp->cpu_mask) {
2874                 per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
2875                 per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
2876         }
2877
2878         /*
2879          * Update the MSR on moved CPUs and CPUs which have moved
2880          * task running on them.
2881          */
2882         cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
2883         update_closid_rmid(tmpmask, NULL);
2884
2885         closid_free(rdtgrp->closid);
2886         free_rmid(rdtgrp->mon.rmid);
2887
2888         /*
2889          * Free all the child monitor group rmids.
2890          */
2891         free_all_child_rdtgrp(rdtgrp);
2892
2893         rdtgroup_ctrl_remove(kn, rdtgrp);
2894
2895         return 0;
2896 }
2897
2898 static int rdtgroup_rmdir(struct kernfs_node *kn)
2899 {
2900         struct kernfs_node *parent_kn = kn->parent;
2901         struct rdtgroup *rdtgrp;
2902         cpumask_var_t tmpmask;
2903         int ret = 0;
2904
2905         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
2906                 return -ENOMEM;
2907
2908         rdtgrp = rdtgroup_kn_lock_live(kn);
2909         if (!rdtgrp) {
2910                 ret = -EPERM;
2911                 goto out;
2912         }
2913
2914         /*
2915          * If the rdtgroup is a ctrl_mon group and parent directory
2916          * is the root directory, remove the ctrl_mon group.
2917          *
2918          * If the rdtgroup is a mon group and parent directory
2919          * is a valid "mon_groups" directory, remove the mon group.
2920          */
2921         if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) {
2922                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2923                     rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
2924                         ret = rdtgroup_ctrl_remove(kn, rdtgrp);
2925                 } else {
2926                         ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
2927                 }
2928         } else if (rdtgrp->type == RDTMON_GROUP &&
2929                  is_mon_groups(parent_kn, kn->name)) {
2930                 ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
2931         } else {
2932                 ret = -EPERM;
2933         }
2934
2935 out:
2936         rdtgroup_kn_unlock(kn);
2937         free_cpumask_var(tmpmask);
2938         return ret;
2939 }
2940
2941 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
2942 {
2943         if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
2944                 seq_puts(seq, ",cdp");
2945
2946         if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
2947                 seq_puts(seq, ",cdpl2");
2948
2949         if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA]))
2950                 seq_puts(seq, ",mba_MBps");
2951
2952         return 0;
2953 }
2954
2955 static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
2956         .mkdir          = rdtgroup_mkdir,
2957         .rmdir          = rdtgroup_rmdir,
2958         .show_options   = rdtgroup_show_options,
2959 };
2960
2961 static int __init rdtgroup_setup_root(void)
2962 {
2963         int ret;
2964
2965         rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
2966                                       KERNFS_ROOT_CREATE_DEACTIVATED |
2967                                       KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
2968                                       &rdtgroup_default);
2969         if (IS_ERR(rdt_root))
2970                 return PTR_ERR(rdt_root);
2971
2972         mutex_lock(&rdtgroup_mutex);
2973
2974         rdtgroup_default.closid = 0;
2975         rdtgroup_default.mon.rmid = 0;
2976         rdtgroup_default.type = RDTCTRL_GROUP;
2977         INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
2978
2979         list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
2980
2981         ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
2982         if (ret) {
2983                 kernfs_destroy_root(rdt_root);
2984                 goto out;
2985         }
2986
2987         rdtgroup_default.kn = rdt_root->kn;
2988         kernfs_activate(rdtgroup_default.kn);
2989
2990 out:
2991         mutex_unlock(&rdtgroup_mutex);
2992
2993         return ret;
2994 }
2995
2996 /*
2997  * rdtgroup_init - rdtgroup initialization
2998  *
2999  * Setup resctrl file system including set up root, create mount point,
3000  * register rdtgroup filesystem, and initialize files under root directory.
3001  *
3002  * Return: 0 on success or -errno
3003  */
3004 int __init rdtgroup_init(void)
3005 {
3006         int ret = 0;
3007
3008         seq_buf_init(&last_cmd_status, last_cmd_status_buf,
3009                      sizeof(last_cmd_status_buf));
3010
3011         ret = rdtgroup_setup_root();
3012         if (ret)
3013                 return ret;
3014
3015         ret = sysfs_create_mount_point(fs_kobj, "resctrl");
3016         if (ret)
3017                 goto cleanup_root;
3018
3019         ret = register_filesystem(&rdt_fs_type);
3020         if (ret)
3021                 goto cleanup_mountpoint;
3022
3023         /*
3024          * Adding the resctrl debugfs directory here may not be ideal since
3025          * it would let the resctrl debugfs directory appear on the debugfs
3026          * filesystem before the resctrl filesystem is mounted.
3027          * It may also be ok since that would enable debugging of RDT before
3028          * resctrl is mounted.
3029          * The reason why the debugfs directory is created here and not in
3030          * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and
3031          * during the debugfs directory creation also &sb->s_type->i_mutex_key
3032          * (the lockdep class of inode->i_rwsem). Other filesystem
3033          * interactions (eg. SyS_getdents) have the lock ordering:
3034          * &sb->s_type->i_mutex_key --> &mm->mmap_sem
3035          * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
3036          * is taken, thus creating dependency:
3037          * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
3038          * issues considering the other two lock dependencies.
3039          * By creating the debugfs directory here we avoid a dependency
3040          * that may cause deadlock (even though file operations cannot
3041          * occur until the filesystem is mounted, but I do not know how to
3042          * tell lockdep that).
3043          */
3044         debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
3045
3046         return 0;
3047
3048 cleanup_mountpoint:
3049         sysfs_remove_mount_point(fs_kobj, "resctrl");
3050 cleanup_root:
3051         kernfs_destroy_root(rdt_root);
3052
3053         return ret;
3054 }
3055
3056 void __exit rdtgroup_exit(void)
3057 {
3058         debugfs_remove_recursive(debugfs_resctrl);
3059         unregister_filesystem(&rdt_fs_type);
3060         sysfs_remove_mount_point(fs_kobj, "resctrl");
3061         kernfs_destroy_root(rdt_root);
3062 }