drivers/perf/arm_spe_pmu.c

   1 /*
   2  * Perf support for the Statistical Profiling Extension, introduced as
   3  * part of ARMv8.2.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License version 2 as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16  *
  17  * Copyright (C) 2016 ARM Limited
  18  *
  19  * Author: Will Deacon <will.deacon@arm.com>
  20  */
  21
  /*
   * Driver naming. DRVNAME is PMUNAME with "_pmu" appended by string literal
   * concatenation, and pr_fmt() prefixes its format argument with
   * DRVNAME ": ".
   * NOTE(review): pr_fmt is presumably consumed by the pr_*() helpers (for
   * example the pr_err_ratelimited() call in this file), which would be why
   * it is defined ahead of the #include block -- confirm.
   */
  22 #define PMUNAME                                 "arm_spe"
  23 #define DRVNAME                                 PMUNAME "_pmu"
  24 #define pr_fmt(fmt)                             DRVNAME ": " fmt
  25
  26 #include <linux/bitops.h>
  27 #include <linux/bug.h>
  28 #include <linux/capability.h>
  29 #include <linux/cpuhotplug.h>
  30 #include <linux/cpumask.h>
  31 #include <linux/device.h>
  32 #include <linux/errno.h>
  33 #include <linux/interrupt.h>
  34 #include <linux/irq.h>
  35 #include <linux/kernel.h>
  36 #include <linux/list.h>
  37 #include <linux/module.h>
  38 #include <linux/of_address.h>
  39 #include <linux/of_device.h>
  40 #include <linux/perf_event.h>
  41 #include <linux/platform_device.h>
  42 #include <linux/printk.h>
  43 #include <linux/slab.h>
  44 #include <linux/smp.h>
  45 #include <linux/vmalloc.h>
  46
  47 #include <asm/barrier.h>
  48 #include <asm/cpufeature.h>
  49 #include <asm/mmu.h>
  50 #include <asm/sysreg.h>
  51
  /* Fill byte that arm_spe_pmu_pad_buf() memset()s over padded buffer space. */
  52 #define ARM_SPE_BUF_PAD_BYTE                    0
  53
  /*
   * Driver-private description of an AUX buffer. The rest of this file
   * retrieves it from an output handle with perf_get_aux().
   */
  54 struct arm_spe_pmu_buf {
          /* Buffer size in pages; byte size is nr_pages * PAGE_SIZE. */
  55         int                                     nr_pages;
          /* Selects the snapshot-mode limit computation and skips perf_aux_output_skip(). */
  56         bool                                    snapshot;
          /* Start of the buffer; offsets computed by PERF_IDX2OFF() are added to it. */
  57         void                                    *base;
  58 };
  59
  /*
   * Per-PMU driver state. The embedded struct pmu is what to_spe_pmu() uses
   * to get back from a struct pmu pointer to this structure.
   */
  60 struct arm_spe_pmu {
  61         struct pmu                              pmu;
  62         struct platform_device                  *pdev;
          /* CPUs an event may target; checked in event_init()/add(), shown as "cpumask". */
  63         cpumask_t                               supported_cpus;
  64         struct hlist_node                       hotplug_node;
  65
  66         int                                     irq; /* PPI */
  67
          /* Lower clamp for the sample period; exported as caps/min_interval. */
  68         u16                                     min_period;
          /* Exported as caps/count_size. */
  69         u16                                     counter_sz;
  70
          /*
           * Bits of 'features'. The FILT_* bits gate the corresponding filter
           * requests in event_init(); ARCH_INST and ERND are exported as
           * boolean capabilities through arm_spe_pmu_feat_caps[].
           */
  71 #define SPE_PMU_FEAT_FILT_EVT                   (1UL << 0)
  72 #define SPE_PMU_FEAT_FILT_TYP                   (1UL << 1)
  73 #define SPE_PMU_FEAT_FILT_LAT                   (1UL << 2)
  74 #define SPE_PMU_FEAT_ARCH_INST                  (1UL << 3)
  75 #define SPE_PMU_FEAT_LDS                        (1UL << 4)
  76 #define SPE_PMU_FEAT_ERND                       (1UL << 5)
  77 #define SPE_PMU_FEAT_DEV_PROBED                 (1UL << 63)
  78         u64                                     features;
  79
          /* Padding threshold: head is padded forward when closer than this to the limit. */
  80         u16                                     max_record_sz;
          /* Alignment the buffer head is padded up to before computing a limit. */
  81         u16                                     align;
          /* Per-CPU AUX output handle, accessed with this_cpu_ptr(). */
  82         struct perf_output_handle __percpu      *handle;
  83 };
  84
  /* Map a struct pmu pointer back to the struct arm_spe_pmu that embeds it. */
  85 #define to_spe_pmu(p) (container_of(p, struct arm_spe_pmu, pmu))
  86
  87 /* Convert a free-running index from perf into an SPE buffer offset */
  /* (i.e. the index modulo the buffer size in bytes; buf is a struct arm_spe_pmu_buf *) */
  88 #define PERF_IDX2OFF(idx, buf)  ((idx) % ((buf)->nr_pages << PAGE_SHIFT))
  89
  90 /* Keep track of our dynamic hotplug state */
  91 static enum cpuhp_state arm_spe_pmu_online;
  92
  /*
   * Verdict produced by arm_spe_pmu_buf_get_fault_act() after looking at the
   * buffer status register.
   */
  enum arm_spe_pmu_buf_fault_action {
          /* The "service" bit was clear: nothing to handle. */
          SPE_PMU_BUF_FAULT_ACT_SPURIOUS = 0,
          /* Unexpected fault, error code or buffer status code. */
          SPE_PMU_BUF_FAULT_ACT_FATAL,
          /* Buffer-full management event: profiling may be resumed. */
          SPE_PMU_BUF_FAULT_ACT_OK,
  };
  98
  99 /* This sysfs gunk was really good fun to write. */
 /*
  * Identifiers for the values shown under the "caps" sysfs group.
  *
  * Everything below SPE_PMU_CAP_FEAT_MAX is a boolean derived from a feature
  * bit and doubles as an index into arm_spe_pmu_feat_caps[]; the remaining
  * entries are numeric values read straight from struct arm_spe_pmu.
  */
 enum arm_spe_pmu_capabilities {
         /* Boolean, feature-bit backed capabilities */
         SPE_PMU_CAP_ARCH_INST   = 0,
         SPE_PMU_CAP_ERND,

         /* Number of boolean capabilities; also the first numeric one */
         SPE_PMU_CAP_FEAT_MAX,

         /* Numeric capabilities */
         SPE_PMU_CAP_CNT_SZ      = SPE_PMU_CAP_FEAT_MAX,
         SPE_PMU_CAP_MIN_IVAL,
 };
 107
 /*
  * Maps each boolean capability (index < SPE_PMU_CAP_FEAT_MAX) to the
  * SPE_PMU_FEAT_* bit that arm_spe_pmu_cap_get() tests in spe_pmu->features.
  */
 108 static int arm_spe_pmu_feat_caps[SPE_PMU_CAP_FEAT_MAX] = {
 109         [SPE_PMU_CAP_ARCH_INST] = SPE_PMU_FEAT_ARCH_INST,
 110         [SPE_PMU_CAP_ERND]      = SPE_PMU_FEAT_ERND,
 111 };
 112
 113 static u32 arm_spe_pmu_cap_get(struct arm_spe_pmu *spe_pmu, int cap)
 114 {
 115         if (cap < SPE_PMU_CAP_FEAT_MAX)
 116                 return !!(spe_pmu->features & arm_spe_pmu_feat_caps[cap]);
 117
 118         switch (cap) {
 119         case SPE_PMU_CAP_CNT_SZ:
 120                 return spe_pmu->counter_sz;
 121         case SPE_PMU_CAP_MIN_IVAL:
 122                 return spe_pmu->min_period;
 123         default:
 124                 WARN(1, "unknown cap %d\n", cap);
 125         }
 126
 127         return 0;
 128 }
 129
 130 static ssize_t arm_spe_pmu_cap_show(struct device *dev,
 131                                     struct device_attribute *attr,
 132                                     char *buf)
 133 {
 134         struct arm_spe_pmu *spe_pmu = dev_get_drvdata(dev);
 135         struct dev_ext_attribute *ea =
 136                 container_of(attr, struct dev_ext_attribute, attr);
 137         int cap = (long)ea->var;
 138
 139         return snprintf(buf, PAGE_SIZE, "%u\n",
 140                 arm_spe_pmu_cap_get(spe_pmu, cap));
 141 }
 142
 /*
  * SPE_EXT_ATTR_ENTRY(_name, _func, _var) evaluates to a struct attribute
  * pointer suitable for an attribute array: it builds a one-element
  * dev_ext_attribute compound literal (read-only attribute _name, show
  * routine _func, 'var' set to _var) and takes the address of its innermost
  * struct attribute.
  *
  * SPE_CAP_EXT_ATTR_ENTRY(_name, _var) is the same with the show routine
  * fixed to arm_spe_pmu_cap_show(), so _var is a capability identifier.
  */
 143 #define SPE_EXT_ATTR_ENTRY(_name, _func, _var)                          \
 144         &((struct dev_ext_attribute[]) {                                \
 145                 { __ATTR(_name, S_IRUGO, _func, NULL), (void *)_var }   \
 146         })[0].attr.attr
 147
 148 #define SPE_CAP_EXT_ATTR_ENTRY(_name, _var)                             \
 149         SPE_EXT_ATTR_ENTRY(_name, arm_spe_pmu_cap_show, _var)
 150
 /*
  * Attributes of the "caps" group: one read-only file per capability, each
  * tagged with the enum arm_spe_pmu_capabilities value it reports.
  */
 151 static struct attribute *arm_spe_pmu_cap_attr[] = {
 152         SPE_CAP_EXT_ATTR_ENTRY(arch_inst, SPE_PMU_CAP_ARCH_INST),
 153         SPE_CAP_EXT_ATTR_ENTRY(ernd, SPE_PMU_CAP_ERND),
 154         SPE_CAP_EXT_ATTR_ENTRY(count_size, SPE_PMU_CAP_CNT_SZ),
 155         SPE_CAP_EXT_ATTR_ENTRY(min_interval, SPE_PMU_CAP_MIN_IVAL),
 156         NULL,
 157 };
 158
 /* The "caps" attribute group itself. */
 159 static struct attribute_group arm_spe_pmu_cap_group = {
 160         .name   = "caps",
 161         .attrs  = arm_spe_pmu_cap_attr,
 162 };
 163
 /*
  * Layout of the user-visible event configuration. Each field <name> is
  * described by three macros that GEN_PMU_FORMAT_ATTR() and
  * ATTR_CFG_GET_FLD() pick up by token pasting:
  *
  *   ATTR_CFG_FLD_<name>_CFG  member of struct perf_event_attr holding it
  *   ATTR_CFG_FLD_<name>_LO   lowest bit of the field
  *   ATTR_CFG_FLD_<name>_HI   highest bit of the field (inclusive)
  *
  * The trailing comments name the register (field) each value is destined for.
  */
 164 /* User ABI */
 165 #define ATTR_CFG_FLD_ts_enable_CFG              config  /* PMSCR_EL1.TS */
 166 #define ATTR_CFG_FLD_ts_enable_LO               0
 167 #define ATTR_CFG_FLD_ts_enable_HI               0
 168 #define ATTR_CFG_FLD_pa_enable_CFG              config  /* PMSCR_EL1.PA */
 169 #define ATTR_CFG_FLD_pa_enable_LO               1
 170 #define ATTR_CFG_FLD_pa_enable_HI               1
 171 #define ATTR_CFG_FLD_pct_enable_CFG             config  /* PMSCR_EL1.PCT */
 172 #define ATTR_CFG_FLD_pct_enable_LO              2
 173 #define ATTR_CFG_FLD_pct_enable_HI              2
 174 #define ATTR_CFG_FLD_jitter_CFG                 config  /* PMSIRR_EL1.RND */
 175 #define ATTR_CFG_FLD_jitter_LO                  16
 176 #define ATTR_CFG_FLD_jitter_HI                  16
 177 #define ATTR_CFG_FLD_branch_filter_CFG          config  /* PMSFCR_EL1.B */
 178 #define ATTR_CFG_FLD_branch_filter_LO           32
 179 #define ATTR_CFG_FLD_branch_filter_HI           32
 180 #define ATTR_CFG_FLD_load_filter_CFG            config  /* PMSFCR_EL1.LD */
 181 #define ATTR_CFG_FLD_load_filter_LO             33
 182 #define ATTR_CFG_FLD_load_filter_HI             33
 183 #define ATTR_CFG_FLD_store_filter_CFG           config  /* PMSFCR_EL1.ST */
 184 #define ATTR_CFG_FLD_store_filter_LO            34
 185 #define ATTR_CFG_FLD_store_filter_HI            34
 186
 187 #define ATTR_CFG_FLD_event_filter_CFG           config1 /* PMSEVFR_EL1 */
 188 #define ATTR_CFG_FLD_event_filter_LO            0
 189 #define ATTR_CFG_FLD_event_filter_HI            63
 190
 191 #define ATTR_CFG_FLD_min_latency_CFG            config2 /* PMSLATFR_EL1.MINLAT */
 192 #define ATTR_CFG_FLD_min_latency_LO             0
 193 #define ATTR_CFG_FLD_min_latency_HI             11
 194
 /*
  * Helpers built on the ATTR_CFG_FLD_<name>_{CFG,LO,HI} triples.
  *
  * GEN_PMU_FORMAT_ATTR(name) hands PMU_FORMAT_ATTR() a string of the form
  * "<cfg>:<lo>" for a single-bit field or "<cfg>:<lo>-<hi>" for a range.
  * The intermediate _GEN_ level lets the ATTR_CFG_FLD_* arguments be macro
  * expanded before __GEN_PMU_FORMAT_ATTR stringifies them.
  * NOTE(review): only the single-bit arm carries an explicit "\n".
  * Presumably PMU_FORMAT_ATTR() appends its own "\n" literal, which would
  * concatenate onto the last string of the conditional only -- confirm
  * against <linux/perf_event.h> before "fixing" the asymmetry.
  *
  * ATTR_CFG_GET_FLD(attr, name) reads field <name> from the
  * struct perf_event_attr pointed to by attr: the selected config member is
  * shifted right by LO and masked with GENMASK(HI - LO, 0).
  */
 195 /* Why does everything I do descend into this? */
 196 #define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi)                              \
 197         (lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi
 198
 199 #define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi)                               \
 200         __GEN_PMU_FORMAT_ATTR(cfg, lo, hi)
 201
 202 #define GEN_PMU_FORMAT_ATTR(name)                                       \
 203         PMU_FORMAT_ATTR(name,                                           \
 204         _GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG,                 \
 205                              ATTR_CFG_FLD_##name##_LO,                  \
 206                              ATTR_CFG_FLD_##name##_HI))
 207
 208 #define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi)                            \
 209         ((((attr)->cfg) >> lo) & GENMASK(hi - lo, 0))
 210
 211 #define ATTR_CFG_GET_FLD(attr, name)                                    \
 212         _ATTR_CFG_GET_FLD(attr,                                         \
 213                           ATTR_CFG_FLD_##name##_CFG,                    \
 214                           ATTR_CFG_FLD_##name##_LO,                     \
 215                           ATTR_CFG_FLD_##name##_HI)
 216
 /*
  * One format attribute per user ABI field; each invocation is expected to
  * define the format_attr_<name> object referenced in the array below.
  */
 217 GEN_PMU_FORMAT_ATTR(ts_enable);
 218 GEN_PMU_FORMAT_ATTR(pa_enable);
 219 GEN_PMU_FORMAT_ATTR(pct_enable);
 220 GEN_PMU_FORMAT_ATTR(jitter);
 221 GEN_PMU_FORMAT_ATTR(branch_filter);
 222 GEN_PMU_FORMAT_ATTR(load_filter);
 223 GEN_PMU_FORMAT_ATTR(store_filter);
 224 GEN_PMU_FORMAT_ATTR(event_filter);
 225 GEN_PMU_FORMAT_ATTR(min_latency);
 226
 /* NULL-terminated list of the attributes in the "format" group. */
 227 static struct attribute *arm_spe_pmu_formats_attr[] = {
 228         &format_attr_ts_enable.attr,
 229         &format_attr_pa_enable.attr,
 230         &format_attr_pct_enable.attr,
 231         &format_attr_jitter.attr,
 232         &format_attr_branch_filter.attr,
 233         &format_attr_load_filter.attr,
 234         &format_attr_store_filter.attr,
 235         &format_attr_event_filter.attr,
 236         &format_attr_min_latency.attr,
 237         NULL,
 238 };
 239
 /* The "format" attribute group itself. */
 240 static struct attribute_group arm_spe_pmu_format_group = {
 241         .name   = "format",
 242         .attrs  = arm_spe_pmu_formats_attr,
 243 };
 244
 245 static ssize_t arm_spe_pmu_get_attr_cpumask(struct device *dev,
 246                                             struct device_attribute *attr,
 247                                             char *buf)
 248 {
 249         struct arm_spe_pmu *spe_pmu = dev_get_drvdata(dev);
 250
 251         return cpumap_print_to_pagebuf(true, buf, &spe_pmu->supported_cpus);
 252 }
 253 static DEVICE_ATTR(cpumask, S_IRUGO, arm_spe_pmu_get_attr_cpumask, NULL);
 254
 /* Attributes of the unnamed group: currently just "cpumask". */
 255 static struct attribute *arm_spe_pmu_attrs[] = {
 256         &dev_attr_cpumask.attr,
 257         NULL,
 258 };
 259
 /* Group without a .name, wrapping the array above. */
 260 static struct attribute_group arm_spe_pmu_group = {
 261         .attrs  = arm_spe_pmu_attrs,
 262 };
 263
 /* NULL-terminated list of every attribute group this driver defines. */
 264 static const struct attribute_group *arm_spe_pmu_attr_groups[] = {
 265         &arm_spe_pmu_group,
 266         &arm_spe_pmu_cap_group,
 267         &arm_spe_pmu_format_group,
 268         NULL,
 269 };
 270
 271 /* Convert between user ABI and register values */
 272 static u64 arm_spe_event_to_pmscr(struct perf_event *event)
 273 {
 274         struct perf_event_attr *attr = &event->attr;
 275         u64 reg = 0;
 276
 277         reg |= ATTR_CFG_GET_FLD(attr, ts_enable) << SYS_PMSCR_EL1_TS_SHIFT;
 278         reg |= ATTR_CFG_GET_FLD(attr, pa_enable) << SYS_PMSCR_EL1_PA_SHIFT;
 279         reg |= ATTR_CFG_GET_FLD(attr, pct_enable) << SYS_PMSCR_EL1_PCT_SHIFT;
 280
 281         if (!attr->exclude_user)
 282                 reg |= BIT(SYS_PMSCR_EL1_E0SPE_SHIFT);
 283
 284         if (!attr->exclude_kernel)
 285                 reg |= BIT(SYS_PMSCR_EL1_E1SPE_SHIFT);
 286
 287         if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && capable(CAP_SYS_ADMIN))
 288                 reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT);
 289
 290         return reg;
 291 }
 292
 293 static void arm_spe_event_sanitise_period(struct perf_event *event)
 294 {
 295         struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
 296         u64 period = event->hw.sample_period;
 297         u64 max_period = SYS_PMSIRR_EL1_INTERVAL_MASK
 298                          << SYS_PMSIRR_EL1_INTERVAL_SHIFT;
 299
 300         if (period < spe_pmu->min_period)
 301                 period = spe_pmu->min_period;
 302         else if (period > max_period)
 303                 period = max_period;
 304         else
 305                 period &= max_period;
 306
 307         event->hw.sample_period = period;
 308 }
 309
 310 static u64 arm_spe_event_to_pmsirr(struct perf_event *event)
 311 {
 312         struct perf_event_attr *attr = &event->attr;
 313         u64 reg = 0;
 314
 315         arm_spe_event_sanitise_period(event);
 316
 317         reg |= ATTR_CFG_GET_FLD(attr, jitter) << SYS_PMSIRR_EL1_RND_SHIFT;
 318         reg |= event->hw.sample_period;
 319
 320         return reg;
 321 }
 322
 323 static u64 arm_spe_event_to_pmsfcr(struct perf_event *event)
 324 {
 325         struct perf_event_attr *attr = &event->attr;
 326         u64 reg = 0;
 327
 328         reg |= ATTR_CFG_GET_FLD(attr, load_filter) << SYS_PMSFCR_EL1_LD_SHIFT;
 329         reg |= ATTR_CFG_GET_FLD(attr, store_filter) << SYS_PMSFCR_EL1_ST_SHIFT;
 330         reg |= ATTR_CFG_GET_FLD(attr, branch_filter) << SYS_PMSFCR_EL1_B_SHIFT;
 331
 332         if (reg)
 333                 reg |= BIT(SYS_PMSFCR_EL1_FT_SHIFT);
 334
 335         if (ATTR_CFG_GET_FLD(attr, event_filter))
 336                 reg |= BIT(SYS_PMSFCR_EL1_FE_SHIFT);
 337
 338         if (ATTR_CFG_GET_FLD(attr, min_latency))
 339                 reg |= BIT(SYS_PMSFCR_EL1_FL_SHIFT);
 340
 341         return reg;
 342 }
 343
 344 static u64 arm_spe_event_to_pmsevfr(struct perf_event *event)
 345 {
 346         struct perf_event_attr *attr = &event->attr;
 347         return ATTR_CFG_GET_FLD(attr, event_filter);
 348 }
 349
 350 static u64 arm_spe_event_to_pmslatfr(struct perf_event *event)
 351 {
 352         struct perf_event_attr *attr = &event->attr;
 353         return ATTR_CFG_GET_FLD(attr, min_latency)
 354                << SYS_PMSLATFR_EL1_MINLAT_SHIFT;
 355 }
 356
 357 static void arm_spe_pmu_pad_buf(struct perf_output_handle *handle, int len)
 358 {
 359         struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
 360         u64 head = PERF_IDX2OFF(handle->head, buf);
 361
 362         memset(buf->base + head, ARM_SPE_BUF_PAD_BYTE, len);
 363         if (!buf->snapshot)
 364                 perf_aux_output_skip(handle, len);
 365 }
 366
 367 static u64 arm_spe_pmu_next_snapshot_off(struct perf_output_handle *handle)
 368 {
 369         struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
 370         struct arm_spe_pmu *spe_pmu = to_spe_pmu(handle->event->pmu);
 371         u64 head = PERF_IDX2OFF(handle->head, buf);
 372         u64 limit = buf->nr_pages * PAGE_SIZE;
 373
 374         /*
 375          * The trace format isn't parseable in reverse, so clamp
 376          * the limit to half of the buffer size in snapshot mode
 377          * so that the worst case is half a buffer of records, as
 378          * opposed to a single record.
 379          */
 380         if (head < limit >> 1)
 381                 limit >>= 1;
 382
 383         /*
 384          * If we're within max_record_sz of the limit, we must
 385          * pad, move the head index and recompute the limit.
 386          */
 387         if (limit - head < spe_pmu->max_record_sz) {
 388                 arm_spe_pmu_pad_buf(handle, limit - head);
 389                 handle->head = PERF_IDX2OFF(limit, buf);
 390                 limit = ((buf->nr_pages * PAGE_SIZE) >> 1) + handle->head;
 391         }
 392
 393         return limit;
 394 }
 395
 396 static u64 __arm_spe_pmu_next_off(struct perf_output_handle *handle)
 397 {
 398         struct arm_spe_pmu *spe_pmu = to_spe_pmu(handle->event->pmu);
 399         struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
 400         const u64 bufsize = buf->nr_pages * PAGE_SIZE;
 401         u64 limit = bufsize;
 402         u64 head, tail, wakeup;
 403
 404         /*
 405          * The head can be misaligned for two reasons:
 406          *
 407          * 1. The hardware left PMBPTR pointing to the first byte after
 408          *    a record when generating a buffer management event.
 409          *
 410          * 2. We used perf_aux_output_skip to consume handle->size bytes
 411          *    and CIRC_SPACE was used to compute the size, which always
 412          *    leaves one entry free.
 413          *
 414          * Deal with this by padding to the next alignment boundary and
 415          * moving the head index. If we run out of buffer space, we'll
 416          * reduce handle->size to zero and end up reporting truncation.
 417          */
 418         head = PERF_IDX2OFF(handle->head, buf);
 419         if (!IS_ALIGNED(head, spe_pmu->align)) {
 420                 unsigned long delta = roundup(head, spe_pmu->align) - head;
 421
 422                 delta = min(delta, handle->size);
 423                 arm_spe_pmu_pad_buf(handle, delta);
 424                 head = PERF_IDX2OFF(handle->head, buf);
 425         }
 426
 427         /* If we've run out of free space, then nothing more to do */
 428         if (!handle->size)
 429                 goto no_space;
 430
 431         /* Compute the tail and wakeup indices now that we've aligned head */
 432         tail = PERF_IDX2OFF(handle->head + handle->size, buf);
 433         wakeup = PERF_IDX2OFF(handle->wakeup, buf);
 434
 435         /*
 436          * Avoid clobbering unconsumed data. We know we have space, so
 437          * if we see head == tail we know that the buffer is empty. If
 438          * head > tail, then there's nothing to clobber prior to
 439          * wrapping.
 440          */
 441         if (head < tail)
 442                 limit = round_down(tail, PAGE_SIZE);
 443
 444         /*
 445          * Wakeup may be arbitrarily far into the future. If it's not in
 446          * the current generation, either we'll wrap before hitting it,
 447          * or it's in the past and has been handled already.
 448          *
 449          * If there's a wakeup before we wrap, arrange to be woken up by
 450          * the page boundary following it. Keep the tail boundary if
 451          * that's lower.
 452          */
 453         if (handle->wakeup < (handle->head + handle->size) && head <= wakeup)
 454                 limit = min(limit, round_up(wakeup, PAGE_SIZE));
 455
 456         if (limit > head)
 457                 return limit;
 458
 459         arm_spe_pmu_pad_buf(handle, handle->size);
 460 no_space:
 461         perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
 462         perf_aux_output_end(handle, 0);
 463         return 0;
 464 }
 465
 466 static u64 arm_spe_pmu_next_off(struct perf_output_handle *handle)
 467 {
 468         struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
 469         struct arm_spe_pmu *spe_pmu = to_spe_pmu(handle->event->pmu);
 470         u64 limit = __arm_spe_pmu_next_off(handle);
 471         u64 head = PERF_IDX2OFF(handle->head, buf);
 472
 473         /*
 474          * If the head has come too close to the end of the buffer,
 475          * then pad to the end and recompute the limit.
 476          */
 477         if (limit && (limit - head < spe_pmu->max_record_sz)) {
 478                 arm_spe_pmu_pad_buf(handle, limit - head);
 479                 limit = __arm_spe_pmu_next_off(handle);
 480         }
 481
 482         return limit;
 483 }
 484
 485 static void arm_spe_perf_aux_output_begin(struct perf_output_handle *handle,
 486                                           struct perf_event *event)
 487 {
 488         u64 base, limit;
 489         struct arm_spe_pmu_buf *buf;
 490
 491         /* Start a new aux session */
 492         buf = perf_aux_output_begin(handle, event);
 493         if (!buf) {
 494                 event->hw.state |= PERF_HES_STOPPED;
 495                 /*
 496                  * We still need to clear the limit pointer, since the
 497                  * profiler might only be disabled by virtue of a fault.
 498                  */
 499                 limit = 0;
 500                 goto out_write_limit;
 501         }
 502
 503         limit = buf->snapshot ? arm_spe_pmu_next_snapshot_off(handle)
 504                               : arm_spe_pmu_next_off(handle);
 505         if (limit)
 506                 limit |= BIT(SYS_PMBLIMITR_EL1_E_SHIFT);
 507
 508         limit += (u64)buf->base;
 509         base = (u64)buf->base + PERF_IDX2OFF(handle->head, buf);
 510         write_sysreg_s(base, SYS_PMBPTR_EL1);
 511
 512 out_write_limit:
 513         write_sysreg_s(limit, SYS_PMBLIMITR_EL1);
 514 }
 515
 516 static void arm_spe_perf_aux_output_end(struct perf_output_handle *handle)
 517 {
 518         struct arm_spe_pmu_buf *buf = perf_get_aux(handle);
 519         u64 offset, size;
 520
 521         offset = read_sysreg_s(SYS_PMBPTR_EL1) - (u64)buf->base;
 522         size = offset - PERF_IDX2OFF(handle->head, buf);
 523
 524         if (buf->snapshot)
 525                 handle->head = offset;
 526
 527         perf_aux_output_end(handle, size);
 528 }
 529
 /*
  * Stop profiling on the calling CPU in three steps, each followed by a
  * barrier: clear PMSCR_EL1, drain buffered data (psb_csync + dsb), then
  * clear PMBLIMITR_EL1. The statement order is significant; do not reorder.
  */
 530 static void arm_spe_pmu_disable_and_drain_local(void)
 531 {
 532         /* Disable profiling at EL0 and EL1 */
 533         write_sysreg_s(0, SYS_PMSCR_EL1);
 534         isb();
 535
 536         /* Drain any buffered data */
 537         psb_csync();
 538         dsb(nsh);
 539
 540         /* Disable the profiling buffer */
 541         write_sysreg_s(0, SYS_PMBLIMITR_EL1);
 542         isb();
 543 }
 544
 545 /* IRQ handling */
 546 static enum arm_spe_pmu_buf_fault_action
 547 arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle)
 548 {
 549         const char *err_str;
 550         u64 pmbsr;
 551         enum arm_spe_pmu_buf_fault_action ret;
 552
 553         /*
 554          * Ensure new profiling data is visible to the CPU and any external
 555          * aborts have been resolved.
 556          */
 557         psb_csync();
 558         dsb(nsh);
 559
 560         /* Ensure hardware updates to PMBPTR_EL1 are visible */
 561         isb();
 562
 563         /* Service required? */
 564         pmbsr = read_sysreg_s(SYS_PMBSR_EL1);
 565         if (!(pmbsr & BIT(SYS_PMBSR_EL1_S_SHIFT)))
 566                 return SPE_PMU_BUF_FAULT_ACT_SPURIOUS;
 567
 568         /*
 569          * If we've lost data, disable profiling and also set the PARTIAL
 570          * flag to indicate that the last record is corrupted.
 571          */
 572         if (pmbsr & BIT(SYS_PMBSR_EL1_DL_SHIFT))
 573                 perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED |
 574                                              PERF_AUX_FLAG_PARTIAL);
 575
 576         /* Report collisions to userspace so that it can up the period */
 577         if (pmbsr & BIT(SYS_PMBSR_EL1_COLL_SHIFT))
 578                 perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION);
 579
 580         /* We only expect buffer management events */
 581         switch (pmbsr & (SYS_PMBSR_EL1_EC_MASK << SYS_PMBSR_EL1_EC_SHIFT)) {
 582         case SYS_PMBSR_EL1_EC_BUF:
 583                 /* Handled below */
 584                 break;
 585         case SYS_PMBSR_EL1_EC_FAULT_S1:
 586         case SYS_PMBSR_EL1_EC_FAULT_S2:
 587                 err_str = "Unexpected buffer fault";
 588                 goto out_err;
 589         default:
 590                 err_str = "Unknown error code";
 591                 goto out_err;
 592         }
 593
 594         /* Buffer management event */
 595         switch (pmbsr &
 596                 (SYS_PMBSR_EL1_BUF_BSC_MASK << SYS_PMBSR_EL1_BUF_BSC_SHIFT)) {
 597         case SYS_PMBSR_EL1_BUF_BSC_FULL:
 598                 ret = SPE_PMU_BUF_FAULT_ACT_OK;
 599                 goto out_stop;
 600         default:
 601                 err_str = "Unknown buffer status code";
 602         }
 603
 604 out_err:
 605         pr_err_ratelimited("%s on CPU %d [PMBSR=0x%016llx, PMBPTR=0x%016llx, PMBLIMITR=0x%016llx]\n",
 606                            err_str, smp_processor_id(), pmbsr,
 607                            read_sysreg_s(SYS_PMBPTR_EL1),
 608                            read_sysreg_s(SYS_PMBLIMITR_EL1));
 609         ret = SPE_PMU_BUF_FAULT_ACT_FATAL;
 610
 611 out_stop:
 612         arm_spe_perf_aux_output_end(handle);
 613         return ret;
 614 }
 615
 616 static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev)
 617 {
 618         struct perf_output_handle *handle = dev;
 619         struct perf_event *event = handle->event;
 620         enum arm_spe_pmu_buf_fault_action act;
 621
 622         if (!perf_get_aux(handle))
 623                 return IRQ_NONE;
 624
 625         act = arm_spe_pmu_buf_get_fault_act(handle);
 626         if (act == SPE_PMU_BUF_FAULT_ACT_SPURIOUS)
 627                 return IRQ_NONE;
 628
 629         /*
 630          * Ensure perf callbacks have completed, which may disable the
 631          * profiling buffer in response to a TRUNCATION flag.
 632          */
 633         irq_work_run();
 634
 635         switch (act) {
 636         case SPE_PMU_BUF_FAULT_ACT_FATAL:
 637                 /*
 638                  * If a fatal exception occurred then leaving the profiling
 639                  * buffer enabled is a recipe waiting to happen. Since
 640                  * fatal faults don't always imply truncation, make sure
 641                  * that the profiling buffer is disabled explicitly before
 642                  * clearing the syndrome register.
 643                  */
 644                 arm_spe_pmu_disable_and_drain_local();
 645                 break;
 646         case SPE_PMU_BUF_FAULT_ACT_OK:
 647                 /*
 648                  * We handled the fault (the buffer was full), so resume
 649                  * profiling as long as we didn't detect truncation.
 650                  * PMBPTR might be misaligned, but we'll burn that bridge
 651                  * when we get to it.
 652                  */
 653                 if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) {
 654                         arm_spe_perf_aux_output_begin(handle, event);
 655                         isb();
 656                 }
 657                 break;
 658         case SPE_PMU_BUF_FAULT_ACT_SPURIOUS:
 659                 /* We've seen you before, but GCC has the memory of a sieve. */
 660                 break;
 661         }
 662
 663         /* The buffer pointers are now sane, so resume profiling. */
 664         write_sysreg_s(0, SYS_PMBSR_EL1);
 665         return IRQ_HANDLED;
 666 }
 667
 668 /* Perf callbacks */
 669 static int arm_spe_pmu_event_init(struct perf_event *event)
 670 {
 671         u64 reg;
 672         struct perf_event_attr *attr = &event->attr;
 673         struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
 674
 675         /* This is, of course, deeply driver-specific */
 676         if (attr->type != event->pmu->type)
 677                 return -ENOENT;
 678
 679         if (event->cpu >= 0 &&
 680             !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus))
 681                 return -ENOENT;
 682
 683         if (arm_spe_event_to_pmsevfr(event) & SYS_PMSEVFR_EL1_RES0)
 684                 return -EOPNOTSUPP;
 685
 686         if (attr->exclude_idle)
 687                 return -EOPNOTSUPP;
 688
 689         /*
 690          * Feedback-directed frequency throttling doesn't work when we
 691          * have a buffer of samples. We'd need to manually count the
 692          * samples in the buffer when it fills up and adjust the event
 693          * count to reflect that. Instead, just force the user to specify
 694          * a sample period.
 695          */
 696         if (attr->freq)
 697                 return -EINVAL;
 698
 699         reg = arm_spe_event_to_pmsfcr(event);
 700         if ((reg & BIT(SYS_PMSFCR_EL1_FE_SHIFT)) &&
 701             !(spe_pmu->features & SPE_PMU_FEAT_FILT_EVT))
 702                 return -EOPNOTSUPP;
 703
 704         if ((reg & BIT(SYS_PMSFCR_EL1_FT_SHIFT)) &&
 705             !(spe_pmu->features & SPE_PMU_FEAT_FILT_TYP))
 706                 return -EOPNOTSUPP;
 707
 708         if ((reg & BIT(SYS_PMSFCR_EL1_FL_SHIFT)) &&
 709             !(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT))
 710                 return -EOPNOTSUPP;
 711
 712         reg = arm_spe_event_to_pmscr(event);
 713         if (!capable(CAP_SYS_ADMIN) &&
 714             (reg & (BIT(SYS_PMSCR_EL1_PA_SHIFT) |
 715                     BIT(SYS_PMSCR_EL1_CX_SHIFT) |
 716                     BIT(SYS_PMSCR_EL1_PCT_SHIFT))))
 717                 return -EACCES;
 718
 719         return 0;
 720 }
 721
 722 static void arm_spe_pmu_start(struct perf_event *event, int flags)
 723 {
 724         u64 reg;
 725         struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
 726         struct hw_perf_event *hwc = &event->hw;
 727         struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle);
 728
 729         hwc->state = 0;
 730         arm_spe_perf_aux_output_begin(handle, event);
 731         if (hwc->state)
 732                 return;
 733
 734         reg = arm_spe_event_to_pmsfcr(event);
 735         write_sysreg_s(reg, SYS_PMSFCR_EL1);
 736
 737         reg = arm_spe_event_to_pmsevfr(event);
 738         write_sysreg_s(reg, SYS_PMSEVFR_EL1);
 739
 740         reg = arm_spe_event_to_pmslatfr(event);
 741         write_sysreg_s(reg, SYS_PMSLATFR_EL1);
 742
 743         if (flags & PERF_EF_RELOAD) {
 744                 reg = arm_spe_event_to_pmsirr(event);
 745                 write_sysreg_s(reg, SYS_PMSIRR_EL1);
 746                 isb();
 747                 reg = local64_read(&hwc->period_left);
 748                 write_sysreg_s(reg, SYS_PMSICR_EL1);
 749         }
 750
 751         reg = arm_spe_event_to_pmscr(event);
 752         isb();
 753         write_sysreg_s(reg, SYS_PMSCR_EL1);
 754 }
 755
 756 static void arm_spe_pmu_stop(struct perf_event *event, int flags)
 757 {
 758         struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
 759         struct hw_perf_event *hwc = &event->hw;
 760         struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle);
 761
 762         /* If we're already stopped, then nothing to do */
 763         if (hwc->state & PERF_HES_STOPPED)
 764                 return;
 765
 766         /* Stop all trace generation */
 767         arm_spe_pmu_disable_and_drain_local();
 768
 769         if (flags & PERF_EF_UPDATE) {
 770                 /*
 771                  * If there's a fault pending then ensure we contain it
 772                  * to this buffer, since we might be on the context-switch
 773                  * path.
 774                  */
 775                 if (perf_get_aux(handle)) {
 776                         enum arm_spe_pmu_buf_fault_action act;
 777
 778                         act = arm_spe_pmu_buf_get_fault_act(handle);
 779                         if (act == SPE_PMU_BUF_FAULT_ACT_SPURIOUS)
 780                                 arm_spe_perf_aux_output_end(handle);
 781                         else
 782                                 write_sysreg_s(0, SYS_PMBSR_EL1);
 783                 }
 784
 785                 /*
 786                  * This may also contain ECOUNT, but nobody else should
 787                  * be looking at period_left, since we forbid frequency
 788                  * based sampling.
 789                  */
 790                 local64_set(&hwc->period_left, read_sysreg_s(SYS_PMSICR_EL1));
 791                 hwc->state |= PERF_HES_UPTODATE;
 792         }
 793
 794         hwc->state |= PERF_HES_STOPPED;
 795 }
 796
 797 static int arm_spe_pmu_add(struct perf_event *event, int flags)
 798 {
 799         int ret = 0;
 800         struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
 801         struct hw_perf_event *hwc = &event->hw;
 802         int cpu = event->cpu == -1 ? smp_processor_id() : event->cpu;
 803
 804         if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus))
 805                 return -ENOENT;
 806
 807         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 808
 809         if (flags & PERF_EF_START) {
 810                 arm_spe_pmu_start(event, PERF_EF_RELOAD);
 811                 if (hwc->state & PERF_HES_STOPPED)
 812                         ret = -EINVAL;
 813         }
 814
 815         return ret;
 816 }
 817
/*
 * perf ->del() callback: stop the event. @flags is ignored; the stop is
 * always performed with PERF_EF_UPDATE so the event state is saved.
 */
static void arm_spe_pmu_del(struct perf_event *event, int flags)
{
        arm_spe_pmu_stop(event, PERF_EF_UPDATE);
}
 822
/* perf ->read() callback: intentionally a no-op for this PMU. */
static void arm_spe_pmu_read(struct perf_event *event)
{
}
 826
 827 static void *arm_spe_pmu_setup_aux(struct perf_event *event, void **pages,
 828                                    int nr_pages, bool snapshot)
 829 {
 830         int i, cpu = event->cpu;
 831         struct page **pglist;
 832         struct arm_spe_pmu_buf *buf;
 833
 834         /* We need at least two pages for this to work. */
 835         if (nr_pages < 2)
 836                 return NULL;
 837
 838         /*
 839          * We require an even number of pages for snapshot mode, so that
 840          * we can effectively treat the buffer as consisting of two equal
 841          * parts and give userspace a fighting chance of getting some
 842          * useful data out of it.
 843          */
 844         if (!nr_pages || (snapshot && (nr_pages & 1)))
 845                 return NULL;
 846
 847         if (cpu == -1)
 848                 cpu = raw_smp_processor_id();
 849
 850         buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
 851         if (!buf)
 852                 return NULL;
 853
 854         pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
 855         if (!pglist)
 856                 goto out_free_buf;
 857
 858         for (i = 0; i < nr_pages; ++i) {
 859                 struct page *page = virt_to_page(pages[i]);
 860
 861                 if (PagePrivate(page)) {
 862                         pr_warn("unexpected high-order page for auxbuf!");
 863                         goto out_free_pglist;
 864                 }
 865
 866                 pglist[i] = virt_to_page(pages[i]);
 867         }
 868
 869         buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
 870         if (!buf->base)
 871                 goto out_free_pglist;
 872
 873         buf->nr_pages   = nr_pages;
 874         buf->snapshot   = snapshot;
 875
 876         kfree(pglist);
 877         return buf;
 878
 879 out_free_pglist:
 880         kfree(pglist);
 881 out_free_buf:
 882         kfree(buf);
 883         return NULL;
 884 }
 885
 886 static void arm_spe_pmu_free_aux(void *aux)
 887 {
 888         struct arm_spe_pmu_buf *buf = aux;
 889
 890         vunmap(buf->base);
 891         kfree(buf);
 892 }
 893
 894 /* Initialisation and teardown functions */
 895 static int arm_spe_pmu_perf_init(struct arm_spe_pmu *spe_pmu)
 896 {
 897         static atomic_t pmu_idx = ATOMIC_INIT(-1);
 898
 899         int idx;
 900         char *name;
 901         struct device *dev = &spe_pmu->pdev->dev;
 902
 903         spe_pmu->pmu = (struct pmu) {
 904                 .module = THIS_MODULE,
 905                 .capabilities   = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
 906                 .attr_groups    = arm_spe_pmu_attr_groups,
 907                 /*
 908                  * We hitch a ride on the software context here, so that
 909                  * we can support per-task profiling (which is not possible
 910                  * with the invalid context as it doesn't get sched callbacks).
 911                  * This requires that userspace either uses a dummy event for
 912                  * perf_event_open, since the aux buffer is not setup until
 913                  * a subsequent mmap, or creates the profiling event in a
 914                  * disabled state and explicitly PERF_EVENT_IOC_ENABLEs it
 915                  * once the buffer has been created.
 916                  */
 917                 .task_ctx_nr    = perf_sw_context,
 918                 .event_init     = arm_spe_pmu_event_init,
 919                 .add            = arm_spe_pmu_add,
 920                 .del            = arm_spe_pmu_del,
 921                 .start          = arm_spe_pmu_start,
 922                 .stop           = arm_spe_pmu_stop,
 923                 .read           = arm_spe_pmu_read,
 924                 .setup_aux      = arm_spe_pmu_setup_aux,
 925                 .free_aux       = arm_spe_pmu_free_aux,
 926         };
 927
 928         idx = atomic_inc_return(&pmu_idx);
 929         name = devm_kasprintf(dev, GFP_KERNEL, "%s_%d", PMUNAME, idx);
 930         if (!name) {
 931                 dev_err(dev, "failed to allocate name for pmu %d\n", idx);
 932                 return -ENOMEM;
 933         }
 934
 935         return perf_pmu_register(&spe_pmu->pmu, name, -1);
 936 }
 937
/* Unregister the struct pmu embedded in @spe_pmu from perf. */
static void arm_spe_pmu_perf_destroy(struct arm_spe_pmu *spe_pmu)
{
        perf_pmu_unregister(&spe_pmu->pmu);
}
 942
 943 static void __arm_spe_pmu_dev_probe(void *info)
 944 {
 945         int fld;
 946         u64 reg;
 947         struct arm_spe_pmu *spe_pmu = info;
 948         struct device *dev = &spe_pmu->pdev->dev;
 949
 950         fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64DFR0_EL1),
 951                                                    ID_AA64DFR0_PMSVER_SHIFT);
 952         if (!fld) {
 953                 dev_err(dev,
 954                         "unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n",
 955                         fld, smp_processor_id());
 956                 return;
 957         }
 958
 959         /* Read PMBIDR first to determine whether or not we have access */
 960         reg = read_sysreg_s(SYS_PMBIDR_EL1);
 961         if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT)) {
 962                 dev_err(dev,
 963                         "profiling buffer owned by higher exception level\n");
 964                 return;
 965         }
 966
 967         /* Minimum alignment. If it's out-of-range, then fail the probe */
 968         fld = reg >> SYS_PMBIDR_EL1_ALIGN_SHIFT & SYS_PMBIDR_EL1_ALIGN_MASK;
 969         spe_pmu->align = 1 << fld;
 970         if (spe_pmu->align > SZ_2K) {
 971                 dev_err(dev, "unsupported PMBIDR.Align [%d] on CPU %d\n",
 972                         fld, smp_processor_id());
 973                 return;
 974         }
 975
 976         /* It's now safe to read PMSIDR and figure out what we've got */
 977         reg = read_sysreg_s(SYS_PMSIDR_EL1);
 978         if (reg & BIT(SYS_PMSIDR_EL1_FE_SHIFT))
 979                 spe_pmu->features |= SPE_PMU_FEAT_FILT_EVT;
 980
 981         if (reg & BIT(SYS_PMSIDR_EL1_FT_SHIFT))
 982                 spe_pmu->features |= SPE_PMU_FEAT_FILT_TYP;
 983
 984         if (reg & BIT(SYS_PMSIDR_EL1_FL_SHIFT))
 985                 spe_pmu->features |= SPE_PMU_FEAT_FILT_LAT;
 986
 987         if (reg & BIT(SYS_PMSIDR_EL1_ARCHINST_SHIFT))
 988                 spe_pmu->features |= SPE_PMU_FEAT_ARCH_INST;
 989
 990         if (reg & BIT(SYS_PMSIDR_EL1_LDS_SHIFT))
 991                 spe_pmu->features |= SPE_PMU_FEAT_LDS;
 992
 993         if (reg & BIT(SYS_PMSIDR_EL1_ERND_SHIFT))
 994                 spe_pmu->features |= SPE_PMU_FEAT_ERND;
 995
 996         /* This field has a spaced out encoding, so just use a look-up */
 997         fld = reg >> SYS_PMSIDR_EL1_INTERVAL_SHIFT & SYS_PMSIDR_EL1_INTERVAL_MASK;
 998         switch (fld) {
 999         case 0:
1000                 spe_pmu->min_period = 256;
1001                 break;
1002         case 2:
1003                 spe_pmu->min_period = 512;
1004                 break;
1005         case 3:
1006                 spe_pmu->min_period = 768;
1007                 break;
1008         case 4:
1009                 spe_pmu->min_period = 1024;
1010                 break;
1011         case 5:
1012                 spe_pmu->min_period = 1536;
1013                 break;
1014         case 6:
1015                 spe_pmu->min_period = 2048;
1016                 break;
1017         case 7:
1018                 spe_pmu->min_period = 3072;
1019                 break;
1020         default:
1021                 dev_warn(dev, "unknown PMSIDR_EL1.Interval [%d]; assuming 8\n",
1022                          fld);
1023                 /* Fallthrough */
1024         case 8:
1025                 spe_pmu->min_period = 4096;
1026         }
1027
1028         /* Maximum record size. If it's out-of-range, then fail the probe */
1029         fld = reg >> SYS_PMSIDR_EL1_MAXSIZE_SHIFT & SYS_PMSIDR_EL1_MAXSIZE_MASK;
1030         spe_pmu->max_record_sz = 1 << fld;
1031         if (spe_pmu->max_record_sz > SZ_2K || spe_pmu->max_record_sz < 16) {
1032                 dev_err(dev, "unsupported PMSIDR_EL1.MaxSize [%d] on CPU %d\n",
1033                         fld, smp_processor_id());
1034                 return;
1035         }
1036
1037         fld = reg >> SYS_PMSIDR_EL1_COUNTSIZE_SHIFT & SYS_PMSIDR_EL1_COUNTSIZE_MASK;
1038         switch (fld) {
1039         default:
1040                 dev_warn(dev, "unknown PMSIDR_EL1.CountSize [%d]; assuming 2\n",
1041                          fld);
1042                 /* Fallthrough */
1043         case 2:
1044                 spe_pmu->counter_sz = 12;
1045         }
1046
1047         dev_info(dev,
1048                  "probed for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n",
1049                  cpumask_pr_args(&spe_pmu->supported_cpus),
1050                  spe_pmu->max_record_sz, spe_pmu->align, spe_pmu->features);
1051
1052         spe_pmu->features |= SPE_PMU_FEAT_DEV_PROBED;
1053         return;
1054 }
1055
/*
 * Put the profiling unit into a known state: disable and drain profiling,
 * then zero the buffer pointer (PMBPTR_EL1) and the buffer status register
 * (PMBSR_EL1), following each write with an isb().
 */
static void __arm_spe_pmu_reset_local(void)
{
        /*
         * This is probably overkill, as we have no idea where we're
         * draining any buffered data to...
         */
        arm_spe_pmu_disable_and_drain_local();

        /* Reset the buffer base pointer */
        write_sysreg_s(0, SYS_PMBPTR_EL1);
        isb();

        /* Clear any pending management interrupts */
        write_sysreg_s(0, SYS_PMBSR_EL1);
        isb();
}
1072
1073 static void __arm_spe_pmu_setup_one(void *info)
1074 {
1075         struct arm_spe_pmu *spe_pmu = info;
1076
1077         __arm_spe_pmu_reset_local();
1078         enable_percpu_irq(spe_pmu->irq, IRQ_TYPE_NONE);
1079 }
1080
1081 static void __arm_spe_pmu_stop_one(void *info)
1082 {
1083         struct arm_spe_pmu *spe_pmu = info;
1084
1085         disable_percpu_irq(spe_pmu->irq);
1086         __arm_spe_pmu_reset_local();
1087 }
1088
1089 static int arm_spe_pmu_cpu_startup(unsigned int cpu, struct hlist_node *node)
1090 {
1091         struct arm_spe_pmu *spe_pmu;
1092
1093         spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node);
1094         if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus))
1095                 return 0;
1096
1097         __arm_spe_pmu_setup_one(spe_pmu);
1098         return 0;
1099 }
1100
1101 static int arm_spe_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node)
1102 {
1103         struct arm_spe_pmu *spe_pmu;
1104
1105         spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node);
1106         if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus))
1107                 return 0;
1108
1109         __arm_spe_pmu_stop_one(spe_pmu);
1110         return 0;
1111 }
1112
1113 static int arm_spe_pmu_dev_init(struct arm_spe_pmu *spe_pmu)
1114 {
1115         int ret;
1116         cpumask_t *mask = &spe_pmu->supported_cpus;
1117
1118         /* Make sure we probe the hardware on a relevant CPU */
1119         ret = smp_call_function_any(mask,  __arm_spe_pmu_dev_probe, spe_pmu, 1);
1120         if (ret || !(spe_pmu->features & SPE_PMU_FEAT_DEV_PROBED))
1121                 return -ENXIO;
1122
1123         /* Request our PPIs (note that the IRQ is still disabled) */
1124         ret = request_percpu_irq(spe_pmu->irq, arm_spe_pmu_irq_handler, DRVNAME,
1125                                  spe_pmu->handle);
1126         if (ret)
1127                 return ret;
1128
1129         /*
1130          * Register our hotplug notifier now so we don't miss any events.
1131          * This will enable the IRQ for any supported CPUs that are already
1132          * up.
1133          */
1134         ret = cpuhp_state_add_instance(arm_spe_pmu_online,
1135                                        &spe_pmu->hotplug_node);
1136         if (ret)
1137                 free_percpu_irq(spe_pmu->irq, spe_pmu->handle);
1138
1139         return ret;
1140 }
1141
/*
 * Undo arm_spe_pmu_dev_init(): remove the hotplug instance first, then
 * free the per-CPU interrupt.
 */
static void arm_spe_pmu_dev_teardown(struct arm_spe_pmu *spe_pmu)
{
        cpuhp_state_remove_instance(arm_spe_pmu_online, &spe_pmu->hotplug_node);
        free_percpu_irq(spe_pmu->irq, spe_pmu->handle);
}
1147
1148 /* Driver and device probing */
1149 static int arm_spe_pmu_irq_probe(struct arm_spe_pmu *spe_pmu)
1150 {
1151         struct platform_device *pdev = spe_pmu->pdev;
1152         int irq = platform_get_irq(pdev, 0);
1153
1154         if (irq < 0) {
1155                 dev_err(&pdev->dev, "failed to get IRQ (%d)\n", irq);
1156                 return -ENXIO;
1157         }
1158
1159         if (!irq_is_percpu(irq)) {
1160                 dev_err(&pdev->dev, "expected PPI but got SPI (%d)\n", irq);
1161                 return -EINVAL;
1162         }
1163
1164         if (irq_get_percpu_devid_partition(irq, &spe_pmu->supported_cpus)) {
1165                 dev_err(&pdev->dev, "failed to get PPI partition (%d)\n", irq);
1166                 return -EINVAL;
1167         }
1168
1169         spe_pmu->irq = irq;
1170         return 0;
1171 }
1172
1173 static const struct of_device_id arm_spe_pmu_of_match[] = {
1174         { .compatible = "arm,statistical-profiling-extension-v1", .data = (void *)1 },
1175         { /* Sentinel */ },
1176 };
1177
1178 static int arm_spe_pmu_device_dt_probe(struct platform_device *pdev)
1179 {
1180         int ret;
1181         struct arm_spe_pmu *spe_pmu;
1182         struct device *dev = &pdev->dev;
1183
1184         /*
1185          * If kernelspace is unmapped when running at EL0, then the SPE
1186          * buffer will fault and prematurely terminate the AUX session.
1187          */
1188         if (arm64_kernel_unmapped_at_el0()) {
1189                 dev_warn_once(dev, "profiling buffer inaccessible. Try passing \"kpti=off\" on the kernel command line\n");
1190                 return -EPERM;
1191         }
1192
1193         spe_pmu = devm_kzalloc(dev, sizeof(*spe_pmu), GFP_KERNEL);
1194         if (!spe_pmu) {
1195                 dev_err(dev, "failed to allocate spe_pmu\n");
1196                 return -ENOMEM;
1197         }
1198
1199         spe_pmu->handle = alloc_percpu(typeof(*spe_pmu->handle));
1200         if (!spe_pmu->handle)
1201                 return -ENOMEM;
1202
1203         spe_pmu->pdev = pdev;
1204         platform_set_drvdata(pdev, spe_pmu);
1205
1206         ret = arm_spe_pmu_irq_probe(spe_pmu);
1207         if (ret)
1208                 goto out_free_handle;
1209
1210         ret = arm_spe_pmu_dev_init(spe_pmu);
1211         if (ret)
1212                 goto out_free_handle;
1213
1214         ret = arm_spe_pmu_perf_init(spe_pmu);
1215         if (ret)
1216                 goto out_teardown_dev;
1217
1218         return 0;
1219
1220 out_teardown_dev:
1221         arm_spe_pmu_dev_teardown(spe_pmu);
1222 out_free_handle:
1223         free_percpu(spe_pmu->handle);
1224         return ret;
1225 }
1226
1227 static int arm_spe_pmu_device_remove(struct platform_device *pdev)
1228 {
1229         struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev);
1230
1231         arm_spe_pmu_perf_destroy(spe_pmu);
1232         arm_spe_pmu_dev_teardown(spe_pmu);
1233         free_percpu(spe_pmu->handle);
1234         return 0;
1235 }
1236
/*
 * Platform driver definition: binds devices matching arm_spe_pmu_of_match
 * to the probe/remove callbacks above.
 */
static struct platform_driver arm_spe_pmu_driver = {
        .driver = {
                .name           = DRVNAME,
                .of_match_table = of_match_ptr(arm_spe_pmu_of_match),
        },
        .probe  = arm_spe_pmu_device_dt_probe,
        .remove = arm_spe_pmu_device_remove,
};
1245
1246 static int __init arm_spe_pmu_init(void)
1247 {
1248         int ret;
1249
1250         ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME,
1251                                       arm_spe_pmu_cpu_startup,
1252                                       arm_spe_pmu_cpu_teardown);
1253         if (ret < 0)
1254                 return ret;
1255         arm_spe_pmu_online = ret;
1256
1257         ret = platform_driver_register(&arm_spe_pmu_driver);
1258         if (ret)
1259                 cpuhp_remove_multi_state(arm_spe_pmu_online);
1260
1261         return ret;
1262 }
1263
/*
 * Module exit: unregister the platform driver, then remove the hotplug
 * state set up by arm_spe_pmu_init().
 */
static void __exit arm_spe_pmu_exit(void)
{
        platform_driver_unregister(&arm_spe_pmu_driver);
        cpuhp_remove_multi_state(arm_spe_pmu_online);
}
1269
/* Module entry and exit points */
module_init(arm_spe_pmu_init);
module_exit(arm_spe_pmu_exit);

/* Module metadata */
MODULE_DESCRIPTION("Perf driver for the ARMv8.2 Statistical Profiling Extension");
MODULE_AUTHOR("Will Deacon <will.deacon@arm.com>");
MODULE_LICENSE("GPL v2");