arch/x86/events/intel/rapl.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Support Intel RAPL energy consumption counters
   4  * Copyright (C) 2013 Google, Inc., Stephane Eranian
   5  *
   6  * Intel RAPL interface is specified in the IA-32 Manual Vol3b
   7  * section 14.7.1 (September 2013)
   8  *
   9  * RAPL provides more controls than just reporting energy consumption
  10  * however here we only expose the energy consumption free running
  11  * counters (pp0, pkg, dram, gpu, psys).
  12  *
  13  * Each of those counters increments in a power unit defined by the
  14  * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
  15  * but it can vary.
  16  *
  17  * Counter to rapl events mappings:
  18  *
  19  *  pp0 counter: consumption of all physical cores (power plane 0)
  20  *        event: rapl_energy_cores
  21  *    perf code: 0x1
  22  *
  23  *  pkg counter: consumption of the whole processor package
  24  *        event: rapl_energy_pkg
  25  *    perf code: 0x2
  26  *
  27  * dram counter: consumption of the dram domain (servers only)
  28  *        event: rapl_energy_dram
  29  *    perf code: 0x3
  30  *
  31  * gpu counter: consumption of the builtin-gpu domain (client only)
  32  *        event: rapl_energy_gpu
  33  *    perf code: 0x4
  34  *
  35  *  psys counter: consumption of the builtin-psys domain (client only)
  36  *        event: rapl_energy_psys
  37  *    perf code: 0x5
  38  *
  39  * We manage those counters as free running (read-only). They may be
  40  * used simultaneously by other tools, such as turbostat.
  41  *
  42  * The events only support system-wide mode counting. There is no
  43  * sampling support because it does not make sense and is not
  44  * supported by the RAPL hardware.
  45  *
  46  * Because we want to avoid floating-point operations in the kernel,
  47  * the events are all reported in fixed point arithmetic (32.32).
  48  * Tools must adjust the counts to convert them to Watts using
  49  * the duration of the measurement. Tools may use a function such as
  50  * ldexp(raw_count, -32);
  51  */
  52
  53 #define pr_fmt(fmt) "RAPL PMU: " fmt
  54
  55 #include <linux/module.h>
  56 #include <linux/slab.h>
  57 #include <linux/perf_event.h>
  58 #include <linux/nospec.h>
  59 #include <asm/cpu_device_id.h>
  60 #include <asm/intel-family.h>
  61 #include "../perf_event.h"
  62 #include "../probe.h"
  63
  64 MODULE_LICENSE("GPL");
  65
  66 /*
  67  * RAPL energy status counters
  68  */
  69 enum perf_rapl_events {
  70         PERF_RAPL_PP0 = 0,              /* all cores */
  71         PERF_RAPL_PKG,                  /* entire package */
  72         PERF_RAPL_RAM,                  /* DRAM */
  73         PERF_RAPL_PP1,                  /* gpu */
  74         PERF_RAPL_PSYS,                 /* psys */
  75
  76         PERF_RAPL_MAX,
  77         NR_RAPL_DOMAINS = PERF_RAPL_MAX,
  78 };
  79
  80 static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
  81         "pp0-core",
  82         "package",
  83         "dram",
  84         "pp1-gpu",
  85         "psys",
  86 };
  87
  88 /*
  89  * event code: LSB 8 bits, passed in attr->config
  90  * any other bit is reserved
  91  */
  92 #define RAPL_EVENT_MASK 0xFFULL
  93
  94 #define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)           \
  95 static ssize_t __rapl_##_var##_show(struct kobject *kobj,       \
  96                                 struct kobj_attribute *attr,    \
  97                                 char *page)                     \
  98 {                                                               \
  99         BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
 100         return sprintf(page, _format "\n");                     \
 101 }                                                               \
 102 static struct kobj_attribute format_attr_##_var =               \
 103         __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
 104
 105 #define RAPL_CNTR_WIDTH 32
 106
 107 #define RAPL_EVENT_ATTR_STR(_name, v, str)                                      \
 108 static struct perf_pmu_events_attr event_attr_##v = {                           \
 109         .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
 110         .id             = 0,                                                    \
 111         .event_str      = str,                                                  \
 112 };
 113
 114 struct rapl_pmu {
 115         raw_spinlock_t          lock;
 116         int                     n_active;
 117         int                     cpu;
 118         struct list_head        active_list;
 119         struct pmu              *pmu;
 120         ktime_t                 timer_interval;
 121         struct hrtimer          hrtimer;
 122 };
 123
 124 struct rapl_pmus {
 125         struct pmu              pmu;
 126         unsigned int            maxdie;
 127         struct rapl_pmu         *pmus[];
 128 };
 129
 130 struct rapl_model {
 131         unsigned long   events;
 132         bool            apply_quirk;
 133 };
 134
 135  /* 1/2^hw_unit Joule */
 136 static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
 137 static struct rapl_pmus *rapl_pmus;
 138 static cpumask_t rapl_cpu_mask;
 139 static unsigned int rapl_cntr_mask;
 140 static u64 rapl_timer_ms;
 141 static struct perf_msr rapl_msrs[];
 142
 143 static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
 144 {
 145         unsigned int dieid = topology_logical_die_id(cpu);
 146
 147         /*
 148          * The unsigned check also catches the '-1' return value for non
 149          * existent mappings in the topology map.
 150          */
 151         return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
 152 }
 153
 154 static inline u64 rapl_read_counter(struct perf_event *event)
 155 {
 156         u64 raw;
 157         rdmsrl(event->hw.event_base, raw);
 158         return raw;
 159 }
 160
 161 static inline u64 rapl_scale(u64 v, int cfg)
 162 {
 163         if (cfg > NR_RAPL_DOMAINS) {
 164                 pr_warn("Invalid domain %d, failed to scale data\n", cfg);
 165                 return v;
 166         }
 167         /*
 168          * scale delta to smallest unit (1/2^32)
 169          * users must then scale back: count * 1/(1e9*2^32) to get Joules
 170          * or use ldexp(count, -32).
 171          * Watts = Joules/Time delta
 172          */
 173         return v << (32 - rapl_hw_unit[cfg - 1]);
 174 }
 175
 176 static u64 rapl_event_update(struct perf_event *event)
 177 {
 178         struct hw_perf_event *hwc = &event->hw;
 179         u64 prev_raw_count, new_raw_count;
 180         s64 delta, sdelta;
 181         int shift = RAPL_CNTR_WIDTH;
 182
 183 again:
 184         prev_raw_count = local64_read(&hwc->prev_count);
 185         rdmsrl(event->hw.event_base, new_raw_count);
 186
 187         if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 188                             new_raw_count) != prev_raw_count) {
 189                 cpu_relax();
 190                 goto again;
 191         }
 192
 193         /*
 194          * Now we have the new raw value and have updated the prev
 195          * timestamp already. We can now calculate the elapsed delta
 196          * (event-)time and add that to the generic event.
 197          *
 198          * Careful, not all hw sign-extends above the physical width
 199          * of the count.
 200          */
 201         delta = (new_raw_count << shift) - (prev_raw_count << shift);
 202         delta >>= shift;
 203
 204         sdelta = rapl_scale(delta, event->hw.config);
 205
 206         local64_add(sdelta, &event->count);
 207
 208         return new_raw_count;
 209 }
 210
 211 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
 212 {
 213        hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
 214                      HRTIMER_MODE_REL_PINNED);
 215 }
 216
 217 static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
 218 {
 219         struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
 220         struct perf_event *event;
 221         unsigned long flags;
 222
 223         if (!pmu->n_active)
 224                 return HRTIMER_NORESTART;
 225
 226         raw_spin_lock_irqsave(&pmu->lock, flags);
 227
 228         list_for_each_entry(event, &pmu->active_list, active_entry)
 229                 rapl_event_update(event);
 230
 231         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 232
 233         hrtimer_forward_now(hrtimer, pmu->timer_interval);
 234
 235         return HRTIMER_RESTART;
 236 }
 237
 238 static void rapl_hrtimer_init(struct rapl_pmu *pmu)
 239 {
 240         struct hrtimer *hr = &pmu->hrtimer;
 241
 242         hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 243         hr->function = rapl_hrtimer_handle;
 244 }
 245
 246 static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
 247                                    struct perf_event *event)
 248 {
 249         if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
 250                 return;
 251
 252         event->hw.state = 0;
 253
 254         list_add_tail(&event->active_entry, &pmu->active_list);
 255
 256         local64_set(&event->hw.prev_count, rapl_read_counter(event));
 257
 258         pmu->n_active++;
 259         if (pmu->n_active == 1)
 260                 rapl_start_hrtimer(pmu);
 261 }
 262
 263 static void rapl_pmu_event_start(struct perf_event *event, int mode)
 264 {
 265         struct rapl_pmu *pmu = event->pmu_private;
 266         unsigned long flags;
 267
 268         raw_spin_lock_irqsave(&pmu->lock, flags);
 269         __rapl_pmu_event_start(pmu, event);
 270         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 271 }
 272
 273 static void rapl_pmu_event_stop(struct perf_event *event, int mode)
 274 {
 275         struct rapl_pmu *pmu = event->pmu_private;
 276         struct hw_perf_event *hwc = &event->hw;
 277         unsigned long flags;
 278
 279         raw_spin_lock_irqsave(&pmu->lock, flags);
 280
 281         /* mark event as deactivated and stopped */
 282         if (!(hwc->state & PERF_HES_STOPPED)) {
 283                 WARN_ON_ONCE(pmu->n_active <= 0);
 284                 pmu->n_active--;
 285                 if (pmu->n_active == 0)
 286                         hrtimer_cancel(&pmu->hrtimer);
 287
 288                 list_del(&event->active_entry);
 289
 290                 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
 291                 hwc->state |= PERF_HES_STOPPED;
 292         }
 293
 294         /* check if update of sw counter is necessary */
 295         if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
 296                 /*
 297                  * Drain the remaining delta count out of a event
 298                  * that we are disabling:
 299                  */
 300                 rapl_event_update(event);
 301                 hwc->state |= PERF_HES_UPTODATE;
 302         }
 303
 304         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 305 }
 306
 307 static int rapl_pmu_event_add(struct perf_event *event, int mode)
 308 {
 309         struct rapl_pmu *pmu = event->pmu_private;
 310         struct hw_perf_event *hwc = &event->hw;
 311         unsigned long flags;
 312
 313         raw_spin_lock_irqsave(&pmu->lock, flags);
 314
 315         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 316
 317         if (mode & PERF_EF_START)
 318                 __rapl_pmu_event_start(pmu, event);
 319
 320         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 321
 322         return 0;
 323 }
 324
 325 static void rapl_pmu_event_del(struct perf_event *event, int flags)
 326 {
 327         rapl_pmu_event_stop(event, PERF_EF_UPDATE);
 328 }
 329
 330 static int rapl_pmu_event_init(struct perf_event *event)
 331 {
 332         u64 cfg = event->attr.config & RAPL_EVENT_MASK;
 333         int bit, ret = 0;
 334         struct rapl_pmu *pmu;
 335
 336         /* only look at RAPL events */
 337         if (event->attr.type != rapl_pmus->pmu.type)
 338                 return -ENOENT;
 339
 340         /* check only supported bits are set */
 341         if (event->attr.config & ~RAPL_EVENT_MASK)
 342                 return -EINVAL;
 343
 344         if (event->cpu < 0)
 345                 return -EINVAL;
 346
 347         event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
 348
 349         if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
 350                 return -EINVAL;
 351
 352         cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
 353         bit = cfg - 1;
 354
 355         /* check event supported */
 356         if (!(rapl_cntr_mask & (1 << bit)))
 357                 return -EINVAL;
 358
 359         /* unsupported modes and filters */
 360         if (event->attr.sample_period) /* no sampling */
 361                 return -EINVAL;
 362
 363         /* must be done before validate_group */
 364         pmu = cpu_to_rapl_pmu(event->cpu);
 365         if (!pmu)
 366                 return -EINVAL;
 367         event->cpu = pmu->cpu;
 368         event->pmu_private = pmu;
 369         event->hw.event_base = rapl_msrs[bit].msr;
 370         event->hw.config = cfg;
 371         event->hw.idx = bit;
 372
 373         return ret;
 374 }
 375
 376 static void rapl_pmu_event_read(struct perf_event *event)
 377 {
 378         rapl_event_update(event);
 379 }
 380
 381 static ssize_t rapl_get_attr_cpumask(struct device *dev,
 382                                 struct device_attribute *attr, char *buf)
 383 {
 384         return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
 385 }
 386
 387 static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
 388
 389 static struct attribute *rapl_pmu_attrs[] = {
 390         &dev_attr_cpumask.attr,
 391         NULL,
 392 };
 393
 394 static struct attribute_group rapl_pmu_attr_group = {
 395         .attrs = rapl_pmu_attrs,
 396 };
 397
 398 RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
 399 RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
 400 RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
 401 RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
 402 RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");
 403
 404 RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
 405 RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
 406 RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
 407 RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
 408 RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");
 409
 410 /*
 411  * we compute in 0.23 nJ increments regardless of MSR
 412  */
 413 RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
 414 RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
 415 RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
 416 RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
 417 RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
 418
 419 /*
 420  * There are no default events, but we need to create
 421  * "events" group (with empty attrs) before updating
 422  * it with detected events.
 423  */
 424 static struct attribute *attrs_empty[] = {
 425         NULL,
 426 };
 427
 428 static struct attribute_group rapl_pmu_events_group = {
 429         .name = "events",
 430         .attrs = attrs_empty,
 431 };
 432
 433 DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
 434 static struct attribute *rapl_formats_attr[] = {
 435         &format_attr_event.attr,
 436         NULL,
 437 };
 438
 439 static struct attribute_group rapl_pmu_format_group = {
 440         .name = "format",
 441         .attrs = rapl_formats_attr,
 442 };
 443
 444 static const struct attribute_group *rapl_attr_groups[] = {
 445         &rapl_pmu_attr_group,
 446         &rapl_pmu_format_group,
 447         &rapl_pmu_events_group,
 448         NULL,
 449 };
 450
 451 static struct attribute *rapl_events_cores[] = {
 452         EVENT_PTR(rapl_cores),
 453         EVENT_PTR(rapl_cores_unit),
 454         EVENT_PTR(rapl_cores_scale),
 455         NULL,
 456 };
 457
 458 static struct attribute_group rapl_events_cores_group = {
 459         .name  = "events",
 460         .attrs = rapl_events_cores,
 461 };
 462
 463 static struct attribute *rapl_events_pkg[] = {
 464         EVENT_PTR(rapl_pkg),
 465         EVENT_PTR(rapl_pkg_unit),
 466         EVENT_PTR(rapl_pkg_scale),
 467         NULL,
 468 };
 469
 470 static struct attribute_group rapl_events_pkg_group = {
 471         .name  = "events",
 472         .attrs = rapl_events_pkg,
 473 };
 474
 475 static struct attribute *rapl_events_ram[] = {
 476         EVENT_PTR(rapl_ram),
 477         EVENT_PTR(rapl_ram_unit),
 478         EVENT_PTR(rapl_ram_scale),
 479         NULL,
 480 };
 481
 482 static struct attribute_group rapl_events_ram_group = {
 483         .name  = "events",
 484         .attrs = rapl_events_ram,
 485 };
 486
 487 static struct attribute *rapl_events_gpu[] = {
 488         EVENT_PTR(rapl_gpu),
 489         EVENT_PTR(rapl_gpu_unit),
 490         EVENT_PTR(rapl_gpu_scale),
 491         NULL,
 492 };
 493
 494 static struct attribute_group rapl_events_gpu_group = {
 495         .name  = "events",
 496         .attrs = rapl_events_gpu,
 497 };
 498
 499 static struct attribute *rapl_events_psys[] = {
 500         EVENT_PTR(rapl_psys),
 501         EVENT_PTR(rapl_psys_unit),
 502         EVENT_PTR(rapl_psys_scale),
 503         NULL,
 504 };
 505
 506 static struct attribute_group rapl_events_psys_group = {
 507         .name  = "events",
 508         .attrs = rapl_events_psys,
 509 };
 510
 511 static bool test_msr(int idx, void *data)
 512 {
 513         return test_bit(idx, (unsigned long *) data);
 514 }
 515
 516 static struct perf_msr rapl_msrs[] = {
 517         [PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr },
 518         [PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr },
 519         [PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr },
 520         [PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr },
 521         [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr },
 522 };
 523
 524 static int rapl_cpu_offline(unsigned int cpu)
 525 {
 526         struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
 527         int target;
 528
 529         /* Check if exiting cpu is used for collecting rapl events */
 530         if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
 531                 return 0;
 532
 533         pmu->cpu = -1;
 534         /* Find a new cpu to collect rapl events */
 535         target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
 536
 537         /* Migrate rapl events to the new target */
 538         if (target < nr_cpu_ids) {
 539                 cpumask_set_cpu(target, &rapl_cpu_mask);
 540                 pmu->cpu = target;
 541                 perf_pmu_migrate_context(pmu->pmu, cpu, target);
 542         }
 543         return 0;
 544 }
 545
 546 static int rapl_cpu_online(unsigned int cpu)
 547 {
 548         struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
 549         int target;
 550
 551         if (!pmu) {
 552                 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
 553                 if (!pmu)
 554                         return -ENOMEM;
 555
 556                 raw_spin_lock_init(&pmu->lock);
 557                 INIT_LIST_HEAD(&pmu->active_list);
 558                 pmu->pmu = &rapl_pmus->pmu;
 559                 pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
 560                 rapl_hrtimer_init(pmu);
 561
 562                 rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
 563         }
 564
 565         /*
 566          * Check if there is an online cpu in the package which collects rapl
 567          * events already.
 568          */
 569         target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
 570         if (target < nr_cpu_ids)
 571                 return 0;
 572
 573         cpumask_set_cpu(cpu, &rapl_cpu_mask);
 574         pmu->cpu = cpu;
 575         return 0;
 576 }
 577
 578 static int rapl_check_hw_unit(bool apply_quirk)
 579 {
 580         u64 msr_rapl_power_unit_bits;
 581         int i;
 582
 583         /* protect rdmsrl() to handle virtualization */
 584         if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
 585                 return -1;
 586         for (i = 0; i < NR_RAPL_DOMAINS; i++)
 587                 rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
 588
 589         /*
 590          * DRAM domain on HSW server and KNL has fixed energy unit which can be
 591          * different than the unit from power unit MSR. See
 592          * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
 593          * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
 594          */
 595         if (apply_quirk)
 596                 rapl_hw_unit[PERF_RAPL_RAM] = 16;
 597
 598         /*
 599          * Calculate the timer rate:
 600          * Use reference of 200W for scaling the timeout to avoid counter
 601          * overflows. 200W = 200 Joules/sec
 602          * Divide interval by 2 to avoid lockstep (2 * 100)
 603          * if hw unit is 32, then we use 2 ms 1/200/2
 604          */
 605         rapl_timer_ms = 2;
 606         if (rapl_hw_unit[0] < 32) {
 607                 rapl_timer_ms = (1000 / (2 * 100));
 608                 rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
 609         }
 610         return 0;
 611 }
 612
 613 static void __init rapl_advertise(void)
 614 {
 615         int i;
 616
 617         pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
 618                 hweight32(rapl_cntr_mask), rapl_timer_ms);
 619
 620         for (i = 0; i < NR_RAPL_DOMAINS; i++) {
 621                 if (rapl_cntr_mask & (1 << i)) {
 622                         pr_info("hw unit of domain %s 2^-%d Joules\n",
 623                                 rapl_domain_names[i], rapl_hw_unit[i]);
 624                 }
 625         }
 626 }
 627
 628 static void cleanup_rapl_pmus(void)
 629 {
 630         int i;
 631
 632         for (i = 0; i < rapl_pmus->maxdie; i++)
 633                 kfree(rapl_pmus->pmus[i]);
 634         kfree(rapl_pmus);
 635 }
 636
 637 static const struct attribute_group *rapl_attr_update[] = {
 638         &rapl_events_cores_group,
 639         &rapl_events_pkg_group,
 640         &rapl_events_ram_group,
 641         &rapl_events_gpu_group,
 642         &rapl_events_gpu_group,
 643         NULL,
 644 };
 645
 646 static int __init init_rapl_pmus(void)
 647 {
 648         int maxdie = topology_max_packages() * topology_max_die_per_package();
 649         size_t size;
 650
 651         size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
 652         rapl_pmus = kzalloc(size, GFP_KERNEL);
 653         if (!rapl_pmus)
 654                 return -ENOMEM;
 655
 656         rapl_pmus->maxdie               = maxdie;
 657         rapl_pmus->pmu.attr_groups      = rapl_attr_groups;
 658         rapl_pmus->pmu.attr_update      = rapl_attr_update;
 659         rapl_pmus->pmu.task_ctx_nr      = perf_invalid_context;
 660         rapl_pmus->pmu.event_init       = rapl_pmu_event_init;
 661         rapl_pmus->pmu.add              = rapl_pmu_event_add;
 662         rapl_pmus->pmu.del              = rapl_pmu_event_del;
 663         rapl_pmus->pmu.start            = rapl_pmu_event_start;
 664         rapl_pmus->pmu.stop             = rapl_pmu_event_stop;
 665         rapl_pmus->pmu.read             = rapl_pmu_event_read;
 666         rapl_pmus->pmu.module           = THIS_MODULE;
 667         rapl_pmus->pmu.capabilities     = PERF_PMU_CAP_NO_EXCLUDE;
 668         return 0;
 669 }
 670
 671 #define X86_RAPL_MODEL_MATCH(model, init)       \
 672         { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }
 673
 674 static struct rapl_model model_snb = {
 675         .events         = BIT(PERF_RAPL_PP0) |
 676                           BIT(PERF_RAPL_PKG) |
 677                           BIT(PERF_RAPL_PP1),
 678         .apply_quirk    = false,
 679 };
 680
 681 static struct rapl_model model_snbep = {
 682         .events         = BIT(PERF_RAPL_PP0) |
 683                           BIT(PERF_RAPL_PKG) |
 684                           BIT(PERF_RAPL_RAM),
 685         .apply_quirk    = false,
 686 };
 687
 688 static struct rapl_model model_hsw = {
 689         .events         = BIT(PERF_RAPL_PP0) |
 690                           BIT(PERF_RAPL_PKG) |
 691                           BIT(PERF_RAPL_RAM) |
 692                           BIT(PERF_RAPL_PP1),
 693         .apply_quirk    = false,
 694 };
 695
 696 static struct rapl_model model_hsx = {
 697         .events         = BIT(PERF_RAPL_PP0) |
 698                           BIT(PERF_RAPL_PKG) |
 699                           BIT(PERF_RAPL_RAM),
 700         .apply_quirk    = true,
 701 };
 702
 703 static struct rapl_model model_knl = {
 704         .events         = BIT(PERF_RAPL_PKG) |
 705                           BIT(PERF_RAPL_RAM),
 706         .apply_quirk    = true,
 707 };
 708
 709 static struct rapl_model model_skl = {
 710         .events         = BIT(PERF_RAPL_PP0) |
 711                           BIT(PERF_RAPL_PKG) |
 712                           BIT(PERF_RAPL_RAM) |
 713                           BIT(PERF_RAPL_PP1) |
 714                           BIT(PERF_RAPL_PSYS),
 715         .apply_quirk    = false,
 716 };
 717
 718 static const struct x86_cpu_id rapl_model_match[] __initconst = {
 719         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE,            model_snb),
 720         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X,          model_snbep),
 721         X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE,              model_snb),
 722         X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X,            model_snbep),
 723         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL,                model_hsw),
 724         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X,              model_hsx),
 725         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_L,              model_hsw),
 726         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_G,              model_hsw),
 727         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL,              model_hsw),
 728         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_G,            model_hsw),
 729         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X,            model_hsx),
 730         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_D,            model_hsx),
 731         X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL,           model_knl),
 732         X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM,           model_knl),
 733         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L,              model_skl),
 734         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE,                model_skl),
 735         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,              model_hsx),
 736         X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L,             model_skl),
 737         X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE,               model_skl),
 738         X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_L,           model_skl),
 739         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT,          model_hsw),
 740         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_D,        model_hsw),
 741         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS,     model_hsw),
 742         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L,              model_skl),
 743         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE,                model_skl),
 744         X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE_L,            model_skl),
 745         X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE,              model_skl),
 746         {},
 747 };
 748
 749 MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
 750
 751 static int __init rapl_pmu_init(void)
 752 {
 753         const struct x86_cpu_id *id;
 754         struct rapl_model *rm;
 755         int ret;
 756
 757         id = x86_match_cpu(rapl_model_match);
 758         if (!id)
 759                 return -ENODEV;
 760
 761         rm = (struct rapl_model *) id->driver_data;
 762         rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
 763                                         false, (void *) &rm->events);
 764
 765         ret = rapl_check_hw_unit(rm->apply_quirk);
 766         if (ret)
 767                 return ret;
 768
 769         ret = init_rapl_pmus();
 770         if (ret)
 771                 return ret;
 772
 773         /*
 774          * Install callbacks. Core will call them for each online cpu.
 775          */
 776         ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
 777                                 "perf/x86/rapl:online",
 778                                 rapl_cpu_online, rapl_cpu_offline);
 779         if (ret)
 780                 goto out;
 781
 782         ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
 783         if (ret)
 784                 goto out1;
 785
 786         rapl_advertise();
 787         return 0;
 788
 789 out1:
 790         cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
 791 out:
 792         pr_warn("Initialization failed (%d), disabled\n", ret);
 793         cleanup_rapl_pmus();
 794         return ret;
 795 }
 796 module_init(rapl_pmu_init);
 797
 798 static void __exit intel_rapl_exit(void)
 799 {
 800         cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
 801         perf_pmu_unregister(&rapl_pmus->pmu);
 802         cleanup_rapl_pmus();
 803 }
 804 module_exit(intel_rapl_exit);