arch/powerpc/perf/hv-24x7.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Hypervisor supplied "24x7" performance counter support
   4  *
   5  * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
   6  * Copyright 2014 IBM Corporation.
   7  */
   8
   9 #define pr_fmt(fmt) "hv-24x7: " fmt
  10
  11 #include <linux/perf_event.h>
  12 #include <linux/rbtree.h>
  13 #include <linux/module.h>
  14 #include <linux/slab.h>
  15 #include <linux/vmalloc.h>
  16
  17 #include <asm/cputhreads.h>
  18 #include <asm/firmware.h>
  19 #include <asm/hvcall.h>
  20 #include <asm/io.h>
  21 #include <linux/byteorder/generic.h>
  22
  23 #include "hv-24x7.h"
  24 #include "hv-24x7-catalog.h"
  25 #include "hv-common.h"
  26
/*
 * Version of the 24x7 hypervisor API that we should use in this machine.
 * A value of 1 restricts catalog entries to physical domains, see
 * catalog_entry_domain_is_valid().
 */
static int interface_version;

/*
 * Whether we have to aggregate result data for some domains. Consulted by
 * domain_needs_aggregation().
 */
static bool aggregate_result_elements;
  32
/*
 * Return true if @domain is one of the HV_PERF_DOMAIN_* values listed in
 * hv-24x7-domains.h, false otherwise.
 *
 * The DOMAIN() X-macro turns every entry of that header into a bare case
 * label, so all listed domains fall through to the single "return true".
 */
static bool domain_is_valid(unsigned domain)
{
        switch (domain) {
#define DOMAIN(n, v, x, c)              \
        case HV_PERF_DOMAIN_##n:        \
                /* fall through */
#include "hv-24x7-domains.h"
#undef DOMAIN
                return true;
        default:
                return false;
        }
}
  46
/*
 * Return the fourth DOMAIN() argument of the hv-24x7-domains.h entry that
 * matches @domain, or false if no entry matches.
 *
 * NOTE(review): the fourth argument is presumably the "is physical" flag of
 * each domain; confirm against hv-24x7-domains.h.
 */
static bool is_physical_domain(unsigned domain)
{
        switch (domain) {
#define DOMAIN(n, v, x, c)              \
        case HV_PERF_DOMAIN_##n:        \
                return c;
#include "hv-24x7-domains.h"
#undef DOMAIN
        default:
                return false;
        }
}
  59
  60 /* Domains for which more than one result element are returned for each event. */
  61 static bool domain_needs_aggregation(unsigned int domain)
  62 {
  63         return aggregate_result_elements &&
  64                         (domain == HV_PERF_DOMAIN_PHYS_CORE ||
  65                          (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE &&
  66                           domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE));
  67 }
  68
  69 static const char *domain_name(unsigned domain)
  70 {
  71         if (!domain_is_valid(domain))
  72                 return NULL;
  73
  74         switch (domain) {
  75         case HV_PERF_DOMAIN_PHYS_CHIP:          return "Physical Chip";
  76         case HV_PERF_DOMAIN_PHYS_CORE:          return "Physical Core";
  77         case HV_PERF_DOMAIN_VCPU_HOME_CORE:     return "VCPU Home Core";
  78         case HV_PERF_DOMAIN_VCPU_HOME_CHIP:     return "VCPU Home Chip";
  79         case HV_PERF_DOMAIN_VCPU_HOME_NODE:     return "VCPU Home Node";
  80         case HV_PERF_DOMAIN_VCPU_REMOTE_NODE:   return "VCPU Remote Node";
  81         }
  82
  83         WARN_ON_ONCE(domain);
  84         return NULL;
  85 }
  86
  87 static bool catalog_entry_domain_is_valid(unsigned domain)
  88 {
  89         /* POWER8 doesn't support virtual domains. */
  90         if (interface_version == 1)
  91                 return is_physical_domain(domain);
  92         else
  93                 return domain_is_valid(domain);
  94 }
  95
  96 /*
  97  * TODO: Merging events:
  98  * - Think of the hcall as an interface to a 4d array of counters:
  99  *   - x = domains
 100  *   - y = indexes in the domain (core, chip, vcpu, node, etc)
 101  *   - z = offset into the counter space
 102  *   - w = lpars (guest vms, "logical partitions")
 103  * - A single request is: x,y,y_last,z,z_last,w,w_last
 104  *   - this means we can retrieve a rectangle of counters in y,z for a single x.
 105  *
 106  * - Things to consider (ignoring w):
 107  *   - input  cost_per_request = 16
 108  *   - output cost_per_result(ys,zs)  = 8 + 8 * ys + ys * zs
 109  *   - limited number of requests per hcall (must fit into 4K bytes)
 *     - 4k = 16 [buffer header] + 16 [request size] * request_count
 111  *     - 255 requests per hcall
 112  *   - sometimes it will be more efficient to read extra data and discard
 113  */
 114
 115 /*
 116  * Example usage:
 117  *  perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/'
 118  */
 119
/* bits 0-3 of config: one of the HV_PERF_DOMAIN_* values */
EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
/* u16; core, chip and vcpu all occupy the same bits (16-31) of config */
EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31);
EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
/* u32, see "data_offset" */
EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
/* u16 */
EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15);

/* Bit ranges of config/config1/config2 not claimed by any field above. */
EVENT_DEFINE_RANGE(reserved1, config,   4, 15);
EVENT_DEFINE_RANGE(reserved2, config1, 16, 63);
EVENT_DEFINE_RANGE(reserved3, config2,  0, 63);
 134
/*
 * NULL-terminated list of the format attributes defined above; exported
 * through format_group.
 */
static struct attribute *format_attrs[] = {
        &format_attr_domain.attr,
        &format_attr_offset.attr,
        &format_attr_core.attr,
        &format_attr_chip.attr,
        &format_attr_vcpu.attr,
        &format_attr_lpar.attr,
        NULL,
};
 144
/* Attribute group "format": the static field layout attributes. */
static struct attribute_group format_group = {
        .name = "format",
        .attrs = format_attrs,
};

/* Attribute group "events": filled with one attribute per catalog event. */
static struct attribute_group event_group = {
        .name = "events",
        /* .attrs is set in init */
};

/* Attribute group "event_descs": short descriptions of catalog events. */
static struct attribute_group event_desc_group = {
        .name = "event_descs",
        /* .attrs is set in init */
};

/* Attribute group "event_long_descs": long descriptions of catalog events. */
static struct attribute_group event_long_desc_group = {
        .name = "event_long_descs",
        /* .attrs is set in init */
};
 164
/* Slab cache that the catalog page buffers are allocated from. */
static struct kmem_cache *hv_page_cache;

/*
 * NOTE(review): presumably per-CPU transaction flags and the error recorded
 * for the current transaction; their users are outside this part of the
 * file, confirm there.
 */
DEFINE_PER_CPU(int, hv_24x7_txn_flags);
DEFINE_PER_CPU(int, hv_24x7_txn_err);

/*
 * Per-CPU table of perf events.
 * NOTE(review): 255 presumably mirrors the "255 requests per hcall" limit
 * mentioned in the TODO above; confirm against the users of this table.
 */
struct hv_24x7_hw {
        struct perf_event *events[255];
};

DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw);
 175
/*
 * request_buffer and result_buffer are not required to be 4k aligned,
 * but are not allowed to cross any 4k boundary. Aligning them to 4k is
 * the simplest way to ensure that: each buffer is exactly 4096 bytes, so
 * an aligned buffer never reaches into the next 4k block.
 */
#define H24x7_DATA_BUFFER_SIZE  4096
/* Per-CPU request buffer (hv_24x7_reqb) and result buffer (hv_24x7_resb). */
DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
 184
 185 static unsigned int max_num_requests(int interface_version)
 186 {
 187         return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer))
 188                 / H24x7_REQUEST_SIZE(interface_version);
 189 }
 190
 191 static char *event_name(struct hv_24x7_event_data *ev, int *len)
 192 {
 193         *len = be16_to_cpu(ev->event_name_len) - 2;
 194         return (char *)ev->remainder;
 195 }
 196
 197 static char *event_desc(struct hv_24x7_event_data *ev, int *len)
 198 {
 199         unsigned nl = be16_to_cpu(ev->event_name_len);
 200         __be16 *desc_len = (__be16 *)(ev->remainder + nl - 2);
 201
 202         *len = be16_to_cpu(*desc_len) - 2;
 203         return (char *)ev->remainder + nl;
 204 }
 205
 206 static char *event_long_desc(struct hv_24x7_event_data *ev, int *len)
 207 {
 208         unsigned nl = be16_to_cpu(ev->event_name_len);
 209         __be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2);
 210         unsigned desc_len = be16_to_cpu(*desc_len_);
 211         __be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2);
 212
 213         *len = be16_to_cpu(*long_desc_len) - 2;
 214         return (char *)ev->remainder + nl + desc_len;
 215 }
 216
 217 static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev,
 218                                           void *end)
 219 {
 220         void *start = ev;
 221
 222         return (start + offsetof(struct hv_24x7_event_data, remainder)) < end;
 223 }
 224
 225 /*
 226  * Things we don't check:
 227  *  - padding for desc, name, and long/detailed desc is required to be '\0'
 228  *    bytes.
 229  *
 230  *  Return NULL if we pass end,
 231  *  Otherwise return the address of the byte just following the event.
 232  */
 233 static void *event_end(struct hv_24x7_event_data *ev, void *end)
 234 {
 235         void *start = ev;
 236         __be16 *dl_, *ldl_;
 237         unsigned dl, ldl;
 238         unsigned nl = be16_to_cpu(ev->event_name_len);
 239
 240         if (nl < 2) {
 241                 pr_debug("%s: name length too short: %d", __func__, nl);
 242                 return NULL;
 243         }
 244
 245         if (start + nl > end) {
 246                 pr_debug("%s: start=%p + nl=%u > end=%p",
 247                                 __func__, start, nl, end);
 248                 return NULL;
 249         }
 250
 251         dl_ = (__be16 *)(ev->remainder + nl - 2);
 252         if (!IS_ALIGNED((uintptr_t)dl_, 2))
 253                 pr_warn("desc len not aligned %p", dl_);
 254         dl = be16_to_cpu(*dl_);
 255         if (dl < 2) {
 256                 pr_debug("%s: desc len too short: %d", __func__, dl);
 257                 return NULL;
 258         }
 259
 260         if (start + nl + dl > end) {
 261                 pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p",
 262                                 __func__, start, nl, dl, start + nl + dl, end);
 263                 return NULL;
 264         }
 265
 266         ldl_ = (__be16 *)(ev->remainder + nl + dl - 2);
 267         if (!IS_ALIGNED((uintptr_t)ldl_, 2))
 268                 pr_warn("long desc len not aligned %p", ldl_);
 269         ldl = be16_to_cpu(*ldl_);
 270         if (ldl < 2) {
 271                 pr_debug("%s: long desc len too short (ldl=%u)",
 272                                 __func__, ldl);
 273                 return NULL;
 274         }
 275
 276         if (start + nl + dl + ldl > end) {
 277                 pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p",
 278                                 __func__, start, nl, dl, ldl, end);
 279                 return NULL;
 280         }
 281
 282         return start + nl + dl + ldl;
 283 }
 284
 285 static long h_get_24x7_catalog_page_(unsigned long phys_4096,
 286                                      unsigned long version, unsigned long index)
 287 {
 288         pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)",
 289                         phys_4096, version, index);
 290
 291         WARN_ON(!IS_ALIGNED(phys_4096, 4096));
 292
 293         return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE,
 294                         phys_4096, version, index);
 295 }
 296
 297 static long h_get_24x7_catalog_page(char page[], u64 version, u32 index)
 298 {
 299         return h_get_24x7_catalog_page_(virt_to_phys(page),
 300                                         version, index);
 301 }
 302
 303 /*
 304  * Each event we find in the catalog, will have a sysfs entry. Format the
 305  * data for this sysfs entry based on the event's domain.
 306  *
 307  * Events belonging to the Chip domain can only be monitored in that domain.
 * i.e. the domain for these events is a fixed/known value.
 309  *
 310  * Events belonging to the Core domain can be monitored either in the physical
 311  * core or in one of the virtual CPU domains. So the domain value for these
 312  * events must be specified by the user (i.e is a required parameter). Format
 313  * the Core events with 'domain=?' so the perf-tool can error check required
 314  * parameters.
 315  *
 316  * NOTE: For the Core domain events, rather than making domain a required
 *       parameter we could default it to PHYS_CORE and allow users to
 318  *       override the domain to one of the VCPU domains.
 319  *
 320  *       However, this can make the interface a little inconsistent.
 321  *
 322  *       If we set domain=2 (PHYS_CHIP) and allow user to override this field
 323  *       the user may be tempted to also modify the "offset=x" field in which
 324  *       can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and
 325  *       HPM_INST (offset=0x20) events. With:
 326  *
 327  *              perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
 328  *
 329  *      we end up monitoring HPM_INST, while the command line has HPM_PCYC.
 330  *
 331  *      By not assigning a default value to the domain for the Core events,
 332  *      we can have simple guidelines:
 333  *
 334  *              - Specifying values for parameters with "=?" is required.
 335  *
 336  *              - Specifying (i.e overriding) values for other parameters
 337  *                is undefined.
 338  */
 339 static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain)
 340 {
 341         const char *sindex;
 342         const char *lpar;
 343         const char *domain_str;
 344         char buf[8];
 345
 346         switch (domain) {
 347         case HV_PERF_DOMAIN_PHYS_CHIP:
 348                 snprintf(buf, sizeof(buf), "%d", domain);
 349                 domain_str = buf;
 350                 lpar = "0x0";
 351                 sindex = "chip";
 352                 break;
 353         case HV_PERF_DOMAIN_PHYS_CORE:
 354                 domain_str = "?";
 355                 lpar = "0x0";
 356                 sindex = "core";
 357                 break;
 358         default:
 359                 domain_str = "?";
 360                 lpar = "?";
 361                 sindex = "vcpu";
 362         }
 363
 364         return kasprintf(GFP_KERNEL,
 365                         "domain=%s,offset=0x%x,%s=?,lpar=%s",
 366                         domain_str,
 367                         be16_to_cpu(event->event_counter_offs) +
 368                                 be16_to_cpu(event->event_group_record_offs),
 369                         sindex,
 370                         lpar);
 371 }
 372
 373 /* Avoid trusting fw to NUL terminate strings */
 374 static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp)
 375 {
 376         return kasprintf(gfp, "%.*s", max_len, maybe_str);
 377 }
 378
 379 static ssize_t device_show_string(struct device *dev,
 380                 struct device_attribute *attr, char *buf)
 381 {
 382         struct dev_ext_attribute *d;
 383
 384         d = container_of(attr, struct dev_ext_attribute, attr);
 385
 386         return sprintf(buf, "%s\n", (char *)d->var);
 387 }
 388
 389 static struct attribute *device_str_attr_create_(char *name, char *str)
 390 {
 391         struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);
 392
 393         if (!attr)
 394                 return NULL;
 395
 396         sysfs_attr_init(&attr->attr.attr);
 397
 398         attr->var = str;
 399         attr->attr.attr.name = name;
 400         attr->attr.attr.mode = 0444;
 401         attr->attr.show = device_show_string;
 402
 403         return &attr->attr.attr;
 404 }
 405
 406 /*
 407  * Allocate and initialize strings representing event attributes.
 408  *
 409  * NOTE: The strings allocated here are never destroyed and continue to
 410  *       exist till shutdown. This is to allow us to create as many events
 411  *       from the catalog as possible, even if we encounter errors with some.
 412  *       In case of changes to error paths in future, these may need to be
 413  *       freed by the caller.
 414  */
 415 static struct attribute *device_str_attr_create(char *name, int name_max,
 416                                                 int name_nonce,
 417                                                 char *str, size_t str_max)
 418 {
 419         char *n;
 420         char *s = memdup_to_str(str, str_max, GFP_KERNEL);
 421         struct attribute *a;
 422
 423         if (!s)
 424                 return NULL;
 425
 426         if (!name_nonce)
 427                 n = kasprintf(GFP_KERNEL, "%.*s", name_max, name);
 428         else
 429                 n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name,
 430                                         name_nonce);
 431         if (!n)
 432                 goto out_s;
 433
 434         a = device_str_attr_create_(n, s);
 435         if (!a)
 436                 goto out_n;
 437
 438         return a;
 439 out_n:
 440         kfree(n);
 441 out_s:
 442         kfree(s);
 443         return NULL;
 444 }
 445
 446 static struct attribute *event_to_attr(unsigned ix,
 447                                        struct hv_24x7_event_data *event,
 448                                        unsigned domain,
 449                                        int nonce)
 450 {
 451         int event_name_len;
 452         char *ev_name, *a_ev_name, *val;
 453         struct attribute *attr;
 454
 455         if (!domain_is_valid(domain)) {
 456                 pr_warn("catalog event %u has invalid domain %u\n",
 457                                 ix, domain);
 458                 return NULL;
 459         }
 460
 461         val = event_fmt(event, domain);
 462         if (!val)
 463                 return NULL;
 464
 465         ev_name = event_name(event, &event_name_len);
 466         if (!nonce)
 467                 a_ev_name = kasprintf(GFP_KERNEL, "%.*s",
 468                                 (int)event_name_len, ev_name);
 469         else
 470                 a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d",
 471                                 (int)event_name_len, ev_name, nonce);
 472
 473         if (!a_ev_name)
 474                 goto out_val;
 475
 476         attr = device_str_attr_create_(a_ev_name, val);
 477         if (!attr)
 478                 goto out_name;
 479
 480         return attr;
 481 out_name:
 482         kfree(a_ev_name);
 483 out_val:
 484         kfree(val);
 485         return NULL;
 486 }
 487
 488 static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event,
 489                                             int nonce)
 490 {
 491         int nl, dl;
 492         char *name = event_name(event, &nl);
 493         char *desc = event_desc(event, &dl);
 494
 495         /* If there isn't a description, don't create the sysfs file */
 496         if (!dl)
 497                 return NULL;
 498
 499         return device_str_attr_create(name, nl, nonce, desc, dl);
 500 }
 501
 502 static struct attribute *
 503 event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce)
 504 {
 505         int nl, dl;
 506         char *name = event_name(event, &nl);
 507         char *desc = event_long_desc(event, &dl);
 508
 509         /* If there isn't a description, don't create the sysfs file */
 510         if (!dl)
 511                 return NULL;
 512
 513         return device_str_attr_create(name, nl, nonce, desc, dl);
 514 }
 515
 516 static int event_data_to_attrs(unsigned ix, struct attribute **attrs,
 517                                    struct hv_24x7_event_data *event, int nonce)
 518 {
 519         *attrs = event_to_attr(ix, event, event->domain, nonce);
 520         if (!*attrs)
 521                 return -1;
 522
 523         return 0;
 524 }
 525
/*
 * Node of the rbtree used to detect events that share name and domain.
 *
 * @node:   linkage into the tree
 * @name:   event name; points into the catalog data and is not owned by
 *          the node (see event_uniq_destroy())
 * @nl:     length of @name in bytes
 * @ct:     number of duplicates seen so far; 0 for a name seen only once
 * @domain: domain of the event, part of the key together with the name
 */
struct event_uniq {
        struct rb_node node;
        const char *name;
        int nl;
        unsigned ct;
        unsigned domain;
};
 534
 535 static int memord(const void *d1, size_t s1, const void *d2, size_t s2)
 536 {
 537         if (s1 < s2)
 538                 return 1;
 539         if (s1 > s2)
 540                 return -1;
 541
 542         return memcmp(d1, d2, s1);
 543 }
 544
 545 static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2,
 546                        size_t s2, unsigned d2)
 547 {
 548         int r = memord(v1, s1, v2, s2);
 549
 550         if (r)
 551                 return r;
 552         if (d1 > d2)
 553                 return 1;
 554         if (d2 > d1)
 555                 return -1;
 556         return 0;
 557 }
 558
 559 static int event_uniq_add(struct rb_root *root, const char *name, int nl,
 560                           unsigned domain)
 561 {
 562         struct rb_node **new = &(root->rb_node), *parent = NULL;
 563         struct event_uniq *data;
 564
 565         /* Figure out where to put new node */
 566         while (*new) {
 567                 struct event_uniq *it;
 568                 int result;
 569
 570                 it = rb_entry(*new, struct event_uniq, node);
 571                 result = ev_uniq_ord(name, nl, domain, it->name, it->nl,
 572                                         it->domain);
 573
 574                 parent = *new;
 575                 if (result < 0)
 576                         new = &((*new)->rb_left);
 577                 else if (result > 0)
 578                         new = &((*new)->rb_right);
 579                 else {
 580                         it->ct++;
 581                         pr_info("found a duplicate event %.*s, ct=%u\n", nl,
 582                                                 name, it->ct);
 583                         return it->ct;
 584                 }
 585         }
 586
 587         data = kmalloc(sizeof(*data), GFP_KERNEL);
 588         if (!data)
 589                 return -ENOMEM;
 590
 591         *data = (struct event_uniq) {
 592                 .name = name,
 593                 .nl = nl,
 594                 .ct = 0,
 595                 .domain = domain,
 596         };
 597
 598         /* Add new node and rebalance tree. */
 599         rb_link_node(&data->node, parent, new);
 600         rb_insert_color(&data->node, root);
 601
 602         /* data->ct */
 603         return 0;
 604 }
 605
 606 static void event_uniq_destroy(struct rb_root *root)
 607 {
 608         /*
 609          * the strings we point to are in the giant block of memory filled by
 610          * the catalog, and are freed separately.
 611          */
 612         struct event_uniq *pos, *n;
 613
 614         rbtree_postorder_for_each_entry_safe(pos, n, root, node)
 615                 kfree(pos);
 616 }
 617
 618
 619 /*
 620  * ensure the event structure's sizes are self consistent and don't cause us to
 621  * read outside of the event
 622  *
 623  * On success, return the event length in bytes.
 624  * Otherwise, return -1 (and print as appropriate).
 625  */
 626 static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
 627                                           size_t event_idx,
 628                                           size_t event_data_bytes,
 629                                           size_t event_entry_count,
 630                                           size_t offset, void *end)
 631 {
 632         ssize_t ev_len;
 633         void *ev_end, *calc_ev_end;
 634
 635         if (offset >= event_data_bytes)
 636                 return -1;
 637
 638         if (event_idx >= event_entry_count) {
 639                 pr_devel("catalog event data has %zu bytes of padding after last event\n",
 640                                 event_data_bytes - offset);
 641                 return -1;
 642         }
 643
 644         if (!event_fixed_portion_is_within(event, end)) {
 645                 pr_warn("event %zu fixed portion is not within range\n",
 646                                 event_idx);
 647                 return -1;
 648         }
 649
 650         ev_len = be16_to_cpu(event->length);
 651
 652         if (ev_len % 16)
 653                 pr_info("event %zu has length %zu not divisible by 16: event=%pK\n",
 654                                 event_idx, ev_len, event);
 655
 656         ev_end = (__u8 *)event + ev_len;
 657         if (ev_end > end) {
 658                 pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n",
 659                                 event_idx, ev_len, ev_end, end,
 660                                 offset);
 661                 return -1;
 662         }
 663
 664         calc_ev_end = event_end(event, end);
 665         if (!calc_ev_end) {
 666                 pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n",
 667                         event_idx, event_data_bytes, event, end,
 668                         offset);
 669                 return -1;
 670         }
 671
 672         if (calc_ev_end > ev_end) {
 673                 pr_warn("event %zu exceeds it's own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n",
 674                         event_idx, event, ev_end, offset, calc_ev_end);
 675                 return -1;
 676         }
 677
 678         return ev_len;
 679 }
 680
/* Largest count of 4096-byte pages whose size in bytes still fits a size_t. */
#define MAX_4K (SIZE_MAX / 4096)
 682
 683 static int create_events_from_catalog(struct attribute ***events_,
 684                                       struct attribute ***event_descs_,
 685                                       struct attribute ***event_long_descs_)
 686 {
 687         long hret;
 688         size_t catalog_len, catalog_page_len, event_entry_count,
 689                event_data_len, event_data_offs,
 690                event_data_bytes, junk_events, event_idx, event_attr_ct, i,
 691                attr_max, event_idx_last, desc_ct, long_desc_ct;
 692         ssize_t ct, ev_len;
 693         uint64_t catalog_version_num;
 694         struct attribute **events, **event_descs, **event_long_descs;
 695         struct hv_24x7_catalog_page_0 *page_0 =
 696                 kmem_cache_alloc(hv_page_cache, GFP_KERNEL);
 697         void *page = page_0;
 698         void *event_data, *end;
 699         struct hv_24x7_event_data *event;
 700         struct rb_root ev_uniq = RB_ROOT;
 701         int ret = 0;
 702
 703         if (!page) {
 704                 ret = -ENOMEM;
 705                 goto e_out;
 706         }
 707
 708         hret = h_get_24x7_catalog_page(page, 0, 0);
 709         if (hret) {
 710                 ret = -EIO;
 711                 goto e_free;
 712         }
 713
 714         catalog_version_num = be64_to_cpu(page_0->version);
 715         catalog_page_len = be32_to_cpu(page_0->length);
 716
 717         if (MAX_4K < catalog_page_len) {
 718                 pr_err("invalid page count: %zu\n", catalog_page_len);
 719                 ret = -EIO;
 720                 goto e_free;
 721         }
 722
 723         catalog_len = catalog_page_len * 4096;
 724
 725         event_entry_count = be16_to_cpu(page_0->event_entry_count);
 726         event_data_offs   = be16_to_cpu(page_0->event_data_offs);
 727         event_data_len    = be16_to_cpu(page_0->event_data_len);
 728
 729         pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n",
 730                         catalog_version_num, catalog_len,
 731                         event_entry_count, event_data_offs, event_data_len);
 732
 733         if ((MAX_4K < event_data_len)
 734                         || (MAX_4K < event_data_offs)
 735                         || (MAX_4K - event_data_offs < event_data_len)) {
 736                 pr_err("invalid event data offs %zu and/or len %zu\n",
 737                                 event_data_offs, event_data_len);
 738                 ret = -EIO;
 739                 goto e_free;
 740         }
 741
 742         if ((event_data_offs + event_data_len) > catalog_page_len) {
 743                 pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n",
 744                                 event_data_offs,
 745                                 event_data_offs + event_data_len,
 746                                 catalog_page_len);
 747                 ret = -EIO;
 748                 goto e_free;
 749         }
 750
 751         if (SIZE_MAX - 1 < event_entry_count) {
 752                 pr_err("event_entry_count %zu is invalid\n", event_entry_count);
 753                 ret = -EIO;
 754                 goto e_free;
 755         }
 756
 757         event_data_bytes = event_data_len * 4096;
 758
 759         /*
 760          * event data can span several pages, events can cross between these
 761          * pages. Use vmalloc to make this easier.
 762          */
 763         event_data = vmalloc(event_data_bytes);
 764         if (!event_data) {
 765                 pr_err("could not allocate event data\n");
 766                 ret = -ENOMEM;
 767                 goto e_free;
 768         }
 769
 770         end = event_data + event_data_bytes;
 771
 772         /*
 773          * using vmalloc_to_phys() like this only works if PAGE_SIZE is
 774          * divisible by 4096
 775          */
 776         BUILD_BUG_ON(PAGE_SIZE % 4096);
 777
 778         for (i = 0; i < event_data_len; i++) {
 779                 hret = h_get_24x7_catalog_page_(
 780                                 vmalloc_to_phys(event_data + i * 4096),
 781                                 catalog_version_num,
 782                                 i + event_data_offs);
 783                 if (hret) {
 784                         pr_err("Failed to get event data in page %zu: rc=%ld\n",
 785                                i + event_data_offs, hret);
 786                         ret = -EIO;
 787                         goto e_event_data;
 788                 }
 789         }
 790
 791         /*
 792          * scan the catalog to determine the number of attributes we need, and
 793          * verify it at the same time.
 794          */
 795         for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0;
 796              ;
 797              event_idx++, event = (void *)event + ev_len) {
 798                 size_t offset = (void *)event - (void *)event_data;
 799                 char *name;
 800                 int nl;
 801
 802                 ev_len = catalog_event_len_validate(event, event_idx,
 803                                                     event_data_bytes,
 804                                                     event_entry_count,
 805                                                     offset, end);
 806                 if (ev_len < 0)
 807                         break;
 808
 809                 name = event_name(event, &nl);
 810
 811                 if (event->event_group_record_len == 0) {
 812                         pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n",
 813                                         event_idx, nl, name);
 814                         junk_events++;
 815                         continue;
 816                 }
 817
 818                 if (!catalog_entry_domain_is_valid(event->domain)) {
 819                         pr_info("event %zu (%.*s) has invalid domain %d\n",
 820                                         event_idx, nl, name, event->domain);
 821                         junk_events++;
 822                         continue;
 823                 }
 824
 825                 attr_max++;
 826         }
 827
 828         event_idx_last = event_idx;
 829         if (event_idx_last != event_entry_count)
 830                 pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n",
 831                                 event_idx_last, event_entry_count, junk_events);
 832
 833         events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL);
 834         if (!events) {
 835                 ret = -ENOMEM;
 836                 goto e_event_data;
 837         }
 838
 839         event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs),
 840                                 GFP_KERNEL);
 841         if (!event_descs) {
 842                 ret = -ENOMEM;
 843                 goto e_event_attrs;
 844         }
 845
 846         event_long_descs = kmalloc_array(event_idx + 1,
 847                         sizeof(*event_long_descs), GFP_KERNEL);
 848         if (!event_long_descs) {
 849                 ret = -ENOMEM;
 850                 goto e_event_descs;
 851         }
 852
 853         /* Iterate over the catalog filling in the attribute vector */
 854         for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0,
 855                                 event = event_data, event_idx = 0;
 856                         event_idx < event_idx_last;
 857                         event_idx++, ev_len = be16_to_cpu(event->length),
 858                                 event = (void *)event + ev_len) {
 859                 char *name;
 860                 int nl;
 861                 int nonce;
 862                 /*
 863                  * these are the only "bad" events that are intermixed and that
 864                  * we can ignore without issue. make sure to skip them here
 865                  */
 866                 if (event->event_group_record_len == 0)
 867                         continue;
 868                 if (!catalog_entry_domain_is_valid(event->domain))
 869                         continue;
 870
 871                 name  = event_name(event, &nl);
 872                 nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
 873                 ct    = event_data_to_attrs(event_idx, events + event_attr_ct,
 874                                             event, nonce);
 875                 if (ct < 0) {
 876                         pr_warn("event %zu (%.*s) creation failure, skipping\n",
 877                                 event_idx, nl, name);
 878                         junk_events++;
 879                 } else {
 880                         event_attr_ct++;
 881                         event_descs[desc_ct] = event_to_desc_attr(event, nonce);
 882                         if (event_descs[desc_ct])
 883                                 desc_ct++;
 884                         event_long_descs[long_desc_ct] =
 885                                         event_to_long_desc_attr(event, nonce);
 886                         if (event_long_descs[long_desc_ct])
 887                                 long_desc_ct++;
 888                 }
 889         }
 890
 891         pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n",
 892                         event_idx, event_attr_ct, junk_events, desc_ct);
 893
 894         events[event_attr_ct] = NULL;
 895         event_descs[desc_ct] = NULL;
 896         event_long_descs[long_desc_ct] = NULL;
 897
 898         event_uniq_destroy(&ev_uniq);
 899         vfree(event_data);
 900         kmem_cache_free(hv_page_cache, page);
 901
 902         *events_ = events;
 903         *event_descs_ = event_descs;
 904         *event_long_descs_ = event_long_descs;
 905         return 0;
 906
 907 e_event_descs:
 908         kfree(event_descs);
 909 e_event_attrs:
 910         kfree(events);
 911 e_event_data:
 912         vfree(event_data);
 913 e_free:
 914         kmem_cache_free(hv_page_cache, page);
 915 e_out:
 916         *events_ = NULL;
 917         *event_descs_ = NULL;
 918         *event_long_descs_ = NULL;
 919         return ret;
 920 }
 921
 922 static ssize_t catalog_read(struct file *filp, struct kobject *kobj,
 923                             struct bin_attribute *bin_attr, char *buf,
 924                             loff_t offset, size_t count)
 925 {
 926         long hret;
 927         ssize_t ret = 0;
 928         size_t catalog_len = 0, catalog_page_len = 0;
 929         loff_t page_offset = 0;
 930         loff_t offset_in_page;
 931         size_t copy_len;
 932         uint64_t catalog_version_num = 0;
 933         void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);
 934         struct hv_24x7_catalog_page_0 *page_0 = page;
 935
 936         if (!page)
 937                 return -ENOMEM;
 938
 939         hret = h_get_24x7_catalog_page(page, 0, 0);
 940         if (hret) {
 941                 ret = -EIO;
 942                 goto e_free;
 943         }
 944
 945         catalog_version_num = be64_to_cpu(page_0->version);
 946         catalog_page_len = be32_to_cpu(page_0->length);
 947         catalog_len = catalog_page_len * 4096;
 948
 949         page_offset = offset / 4096;
 950         offset_in_page = offset % 4096;
 951
 952         if (page_offset >= catalog_page_len)
 953                 goto e_free;
 954
 955         if (page_offset != 0) {
 956                 hret = h_get_24x7_catalog_page(page, catalog_version_num,
 957                                                page_offset);
 958                 if (hret) {
 959                         ret = -EIO;
 960                         goto e_free;
 961                 }
 962         }
 963
 964         copy_len = 4096 - offset_in_page;
 965         if (copy_len > count)
 966                 copy_len = count;
 967
 968         memcpy(buf, page+offset_in_page, copy_len);
 969         ret = copy_len;
 970
 971 e_free:
 972         if (hret)
 973                 pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
 974                        " rc=%ld\n",
 975                        catalog_version_num, page_offset, hret);
 976         kmem_cache_free(hv_page_cache, page);
 977
 978         pr_devel("catalog_read: offset=%lld(%lld) count=%zu "
 979                         "catalog_len=%zu(%zu) => %zd\n", offset, page_offset,
 980                         count, catalog_len, catalog_page_len, ret);
 981
 982         return ret;
 983 }
 984
 985 static ssize_t domains_show(struct device *dev, struct device_attribute *attr,
 986                             char *page)
 987 {
 988         int d, n, count = 0;
 989         const char *str;
 990
 991         for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) {
 992                 str = domain_name(d);
 993                 if (!str)
 994                         continue;
 995
 996                 n = sprintf(page, "%d: %s\n", d, str);
 997                 if (n < 0)
 998                         break;
 999
1000                 count += n;
1001                 page += n;
1002         }
1003         return count;
1004 }
1005
/*
 * PAGE_0_ATTR(_name, _fmt, _expr) - define a read-only device attribute
 * whose value is derived from catalog page 0.
 *
 * The generated _name##_show() fetches catalog page 0 into a scratch page
 * and prints @_expr into the output buffer using @_fmt.  @_expr may refer to
 * the local 'page_0', which points at the fetched page.  The show function
 * returns the sprintf() result, -ENOMEM if the scratch page cannot be
 * allocated, or -EIO if the hcall fails.
 */
#define PAGE_0_ATTR(_name, _fmt, _expr)                         \
static ssize_t _name##_show(struct device *dev,                 \
                            struct device_attribute *dev_attr,  \
                            char *buf)                          \
{                                                               \
        struct hv_24x7_catalog_page_0 *page_0;                  \
        ssize_t ret;                                            \
        void *page;                                             \
                                                                \
        page = kmem_cache_alloc(hv_page_cache, GFP_USER);       \
        if (!page)                                              \
                return -ENOMEM;                                 \
        page_0 = page;                                          \
                                                                \
        if (h_get_24x7_catalog_page(page, 0, 0))                \
                ret = -EIO;                                     \
        else                                                    \
                ret = sprintf(buf, _fmt, _expr);                \
                                                                \
        kmem_cache_free(hv_page_cache, page);                   \
        return ret;                                             \
}                                                               \
static DEVICE_ATTR_RO(_name)
1028
/* "catalog_version": the version field of catalog page 0, in decimal. */
PAGE_0_ATTR(catalog_version, "%lld\n",
                (unsigned long long)be64_to_cpu(page_0->version));
/* "catalog_len": the length field of catalog page 0 multiplied by 4096. */
PAGE_0_ATTR(catalog_len, "%lld\n",
                (unsigned long long)be32_to_cpu(page_0->length) * 4096);
/* Binary "catalog" attribute, served by catalog_read(); declared size is 0. */
static BIN_ATTR_RO(catalog, 0/* real length varies */);
/* "domains" attribute, served by domains_show(). */
static DEVICE_ATTR_RO(domains);
1035
/* NULL-terminated list of binary attributes in the "interface" group. */
static struct bin_attribute *if_bin_attrs[] = {
        &bin_attr_catalog,
        NULL,
};

/* NULL-terminated list of text attributes in the "interface" group. */
static struct attribute *if_attrs[] = {
        &dev_attr_catalog_len.attr,
        &dev_attr_catalog_version.attr,
        &dev_attr_domains.attr,
        NULL,
};

/* The "interface" group: catalog contents, length, version and domains. */
static struct attribute_group if_group = {
        .name = "interface",
        .bin_attrs = if_bin_attrs,
        .attrs = if_attrs,
};

/*
 * NULL-terminated list of all attribute groups; referenced by
 * h_24x7_pmu.attr_groups.
 */
static const struct attribute_group *attr_groups[] = {
        &format_group,
        &event_group,
        &event_desc_group,
        &event_long_desc_group,
        &if_group,
        NULL,
};
1062
1063 /*
1064  * Start the process for a new H_GET_24x7_DATA hcall.
1065  */
1066 static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer,
1067                               struct hv_24x7_data_result_buffer *result_buffer)
1068 {
1069
1070         memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE);
1071         memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE);
1072
1073         request_buffer->interface_version = interface_version;
1074         /* memset above set request_buffer->num_requests to 0 */
1075 }
1076
1077 /*
1078  * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected
1079  * by 'init_24x7_request()' and 'add_event_to_24x7_request()'.
1080  */
1081 static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer,
1082                              struct hv_24x7_data_result_buffer *result_buffer)
1083 {
1084         long ret;
1085
1086         /*
1087          * NOTE: Due to variable number of array elements in request and
1088          *       result buffer(s), sizeof() is not reliable. Use the actual
1089          *       allocated buffer size, H24x7_DATA_BUFFER_SIZE.
1090          */
1091         ret = plpar_hcall_norets(H_GET_24X7_DATA,
1092                         virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE,
1093                         virt_to_phys(result_buffer),  H24x7_DATA_BUFFER_SIZE);
1094
1095         if (ret) {
1096                 struct hv_24x7_request *req;
1097
1098                 req = request_buffer->requests;
1099                 pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n",
1100                                       req->performance_domain, req->data_offset,
1101                                       req->starting_ix, req->starting_lpar_ix,
1102                                       ret, ret, result_buffer->detailed_rc,
1103                                       result_buffer->failing_request_ix);
1104                 return -EIO;
1105         }
1106
1107         return 0;
1108 }
1109
1110 /*
1111  * Add the given @event to the next slot in the 24x7 request_buffer.
1112  *
1113  * Note that H_GET_24X7_DATA hcall allows reading several counters'
1114  * values in a single HCALL. We expect the caller to add events to the
1115  * request buffer one by one, make the HCALL and process the results.
1116  */
1117 static int add_event_to_24x7_request(struct perf_event *event,
1118                                 struct hv_24x7_request_buffer *request_buffer)
1119 {
1120         u16 idx;
1121         int i;
1122         size_t req_size;
1123         struct hv_24x7_request *req;
1124
1125         if (request_buffer->num_requests >=
1126             max_num_requests(request_buffer->interface_version)) {
1127                 pr_devel("Too many requests for 24x7 HCALL %d\n",
1128                                 request_buffer->num_requests);
1129                 return -EINVAL;
1130         }
1131
1132         switch (event_get_domain(event)) {
1133         case HV_PERF_DOMAIN_PHYS_CHIP:
1134                 idx = event_get_chip(event);
1135                 break;
1136         case HV_PERF_DOMAIN_PHYS_CORE:
1137                 idx = event_get_core(event);
1138                 break;
1139         default:
1140                 idx = event_get_vcpu(event);
1141         }
1142
1143         req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version);
1144
1145         i = request_buffer->num_requests++;
1146         req = (void *) request_buffer->requests + i * req_size;
1147
1148         req->performance_domain = event_get_domain(event);
1149         req->data_size = cpu_to_be16(8);
1150         req->data_offset = cpu_to_be32(event_get_offset(event));
1151         req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event));
1152         req->max_num_lpars = cpu_to_be16(1);
1153         req->starting_ix = cpu_to_be16(idx);
1154         req->max_ix = cpu_to_be16(1);
1155
1156         if (request_buffer->interface_version > 1) {
1157                 if (domain_needs_aggregation(req->performance_domain))
1158                         req->max_num_thread_groups = -1;
1159                 else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) {
1160                         req->starting_thread_group_ix = idx % 2;
1161                         req->max_num_thread_groups = 1;
1162                 }
1163         }
1164
1165         return 0;
1166 }
1167
1168 /**
1169  * get_count_from_result - get event count from all result elements in result
1170  *
1171  * If the event corresponding to this result needs aggregation of the result
1172  * element values, then this function does that.
1173  *
1174  * @event:      Event associated with @res.
1175  * @resb:       Result buffer containing @res.
1176  * @res:        Result to work on.
1177  * @countp:     Output variable containing the event count.
1178  * @next:       Optional output variable pointing to the next result in @resb.
1179  */
1180 static int get_count_from_result(struct perf_event *event,
1181                                  struct hv_24x7_data_result_buffer *resb,
1182                                  struct hv_24x7_result *res, u64 *countp,
1183                                  struct hv_24x7_result **next)
1184 {
1185         u16 num_elements = be16_to_cpu(res->num_elements_returned);
1186         u16 data_size = be16_to_cpu(res->result_element_data_size);
1187         unsigned int data_offset;
1188         void *element_data;
1189         int i;
1190         u64 count;
1191
1192         /*
1193          * We can bail out early if the result is empty.
1194          */
1195         if (!num_elements) {
1196                 pr_debug("Result of request %hhu is empty, nothing to do\n",
1197                          res->result_ix);
1198
1199                 if (next)
1200                         *next = (struct hv_24x7_result *) res->elements;
1201
1202                 return -ENODATA;
1203         }
1204
1205         /*
1206          * Since we always specify 1 as the maximum for the smallest resource
1207          * we're requesting, there should to be only one element per result.
1208          * Except when an event needs aggregation, in which case there are more.
1209          */
1210         if (num_elements != 1 &&
1211             !domain_needs_aggregation(event_get_domain(event))) {
1212                 pr_err("Error: result of request %hhu has %hu elements\n",
1213                        res->result_ix, num_elements);
1214
1215                 return -EIO;
1216         }
1217
1218         if (data_size != sizeof(u64)) {
1219                 pr_debug("Error: result of request %hhu has data of %hu bytes\n",
1220                          res->result_ix, data_size);
1221
1222                 return -ENOTSUPP;
1223         }
1224
1225         if (resb->interface_version == 1)
1226                 data_offset = offsetof(struct hv_24x7_result_element_v1,
1227                                        element_data);
1228         else
1229                 data_offset = offsetof(struct hv_24x7_result_element_v2,
1230                                        element_data);
1231
1232         /* Go through the result elements in the result. */
1233         for (i = count = 0, element_data = res->elements + data_offset;
1234              i < num_elements;
1235              i++, element_data += data_size + data_offset)
1236                 count += be64_to_cpu(*((u64 *) element_data));
1237
1238         *countp = count;
1239
1240         /* The next result is after the last result element. */
1241         if (next)
1242                 *next = element_data - data_offset;
1243
1244         return 0;
1245 }
1246
1247 static int single_24x7_request(struct perf_event *event, u64 *count)
1248 {
1249         int ret;
1250         struct hv_24x7_request_buffer *request_buffer;
1251         struct hv_24x7_data_result_buffer *result_buffer;
1252
1253         BUILD_BUG_ON(sizeof(*request_buffer) > 4096);
1254         BUILD_BUG_ON(sizeof(*result_buffer) > 4096);
1255
1256         request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1257         result_buffer = (void *)get_cpu_var(hv_24x7_resb);
1258
1259         init_24x7_request(request_buffer, result_buffer);
1260
1261         ret = add_event_to_24x7_request(event, request_buffer);
1262         if (ret)
1263                 goto out;
1264
1265         ret = make_24x7_request(request_buffer, result_buffer);
1266         if (ret)
1267                 goto out;
1268
1269         /* process result from hcall */
1270         ret = get_count_from_result(event, result_buffer,
1271                                     result_buffer->results, count, NULL);
1272
1273 out:
1274         put_cpu_var(hv_24x7_reqb);
1275         put_cpu_var(hv_24x7_resb);
1276         return ret;
1277 }
1278
1279
1280 static int h_24x7_event_init(struct perf_event *event)
1281 {
1282         struct hv_perf_caps caps;
1283         unsigned domain;
1284         unsigned long hret;
1285         u64 ct;
1286
1287         /* Not our event */
1288         if (event->attr.type != event->pmu->type)
1289                 return -ENOENT;
1290
1291         /* Unused areas must be 0 */
1292         if (event_get_reserved1(event) ||
1293             event_get_reserved2(event) ||
1294             event_get_reserved3(event)) {
1295                 pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n",
1296                                 event->attr.config,
1297                                 event_get_reserved1(event),
1298                                 event->attr.config1,
1299                                 event_get_reserved2(event),
1300                                 event->attr.config2,
1301                                 event_get_reserved3(event));
1302                 return -EINVAL;
1303         }
1304
1305         /* no branch sampling */
1306         if (has_branch_stack(event))
1307                 return -EOPNOTSUPP;
1308
1309         /* offset must be 8 byte aligned */
1310         if (event_get_offset(event) % 8) {
1311                 pr_devel("bad alignment\n");
1312                 return -EINVAL;
1313         }
1314
1315         domain = event_get_domain(event);
1316         if (domain >= HV_PERF_DOMAIN_MAX) {
1317                 pr_devel("invalid domain %d\n", domain);
1318                 return -EINVAL;
1319         }
1320
1321         hret = hv_perf_caps_get(&caps);
1322         if (hret) {
1323                 pr_devel("could not get capabilities: rc=%ld\n", hret);
1324                 return -EIO;
1325         }
1326
1327         /* Physical domains & other lpars require extra capabilities */
1328         if (!caps.collect_privileged && (is_physical_domain(domain) ||
1329                 (event_get_lpar(event) != event_get_lpar_max()))) {
1330                 pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n",
1331                                 is_physical_domain(domain),
1332                                 event_get_lpar(event));
1333                 return -EACCES;
1334         }
1335
1336         /* Get the initial value of the counter for this event */
1337         if (single_24x7_request(event, &ct)) {
1338                 pr_devel("test hcall failed\n");
1339                 return -EIO;
1340         }
1341         (void)local64_xchg(&event->hw.prev_count, ct);
1342
1343         return 0;
1344 }
1345
1346 static u64 h_24x7_get_value(struct perf_event *event)
1347 {
1348         u64 ct;
1349
1350         if (single_24x7_request(event, &ct))
1351                 /* We checked this in event init, shouldn't fail here... */
1352                 return 0;
1353
1354         return ct;
1355 }
1356
1357 static void update_event_count(struct perf_event *event, u64 now)
1358 {
1359         s64 prev;
1360
1361         prev = local64_xchg(&event->hw.prev_count, now);
1362         local64_add(now - prev, &event->count);
1363 }
1364
1365 static void h_24x7_event_read(struct perf_event *event)
1366 {
1367         u64 now;
1368         struct hv_24x7_request_buffer *request_buffer;
1369         struct hv_24x7_hw *h24x7hw;
1370         int txn_flags;
1371
1372         txn_flags = __this_cpu_read(hv_24x7_txn_flags);
1373
1374         /*
1375          * If in a READ transaction, add this counter to the list of
1376          * counters to read during the next HCALL (i.e commit_txn()).
1377          * If not in a READ transaction, go ahead and make the HCALL
1378          * to read this counter by itself.
1379          */
1380
1381         if (txn_flags & PERF_PMU_TXN_READ) {
1382                 int i;
1383                 int ret;
1384
1385                 if (__this_cpu_read(hv_24x7_txn_err))
1386                         return;
1387
1388                 request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1389
1390                 ret = add_event_to_24x7_request(event, request_buffer);
1391                 if (ret) {
1392                         __this_cpu_write(hv_24x7_txn_err, ret);
1393                 } else {
1394                         /*
1395                          * Associate the event with the HCALL request index,
1396                          * so ->commit_txn() can quickly find/update count.
1397                          */
1398                         i = request_buffer->num_requests - 1;
1399
1400                         h24x7hw = &get_cpu_var(hv_24x7_hw);
1401                         h24x7hw->events[i] = event;
1402                         put_cpu_var(h24x7hw);
1403                         /*
1404                          * Clear the event count so we can compute the _change_
1405                          * in the 24x7 raw counter value at the end of the txn.
1406                          *
1407                          * Note that we could alternatively read the 24x7 value
1408                          * now and save its value in event->hw.prev_count. But
1409                          * that would require issuing a hcall, which would then
1410                          * defeat the purpose of using the txn interface.
1411                          */
1412                         local64_set(&event->count, 0);
1413                 }
1414
1415                 put_cpu_var(hv_24x7_reqb);
1416         } else {
1417                 now = h_24x7_get_value(event);
1418                 update_event_count(event, now);
1419         }
1420 }
1421
1422 static void h_24x7_event_start(struct perf_event *event, int flags)
1423 {
1424         if (flags & PERF_EF_RELOAD)
1425                 local64_set(&event->hw.prev_count, h_24x7_get_value(event));
1426 }
1427
/*
 * pmu::stop callback (also installed as pmu::del).  Stopping only takes a
 * final reading of the event; @flags is not used.
 */
static void h_24x7_event_stop(struct perf_event *event, int flags)
{
        h_24x7_event_read(event);
}
1432
1433 static int h_24x7_event_add(struct perf_event *event, int flags)
1434 {
1435         if (flags & PERF_EF_START)
1436                 h_24x7_event_start(event, flags);
1437
1438         return 0;
1439 }
1440
1441 /*
1442  * 24x7 counters only support READ transactions. They are
1443  * always counting and dont need/support ADD transactions.
1444  * Cache the flags, but otherwise ignore transactions that
1445  * are not PERF_PMU_TXN_READ.
1446  */
1447 static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags)
1448 {
1449         struct hv_24x7_request_buffer *request_buffer;
1450         struct hv_24x7_data_result_buffer *result_buffer;
1451
1452         /* We should not be called if we are already in a txn */
1453         WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags));
1454
1455         __this_cpu_write(hv_24x7_txn_flags, flags);
1456         if (flags & ~PERF_PMU_TXN_READ)
1457                 return;
1458
1459         request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1460         result_buffer = (void *)get_cpu_var(hv_24x7_resb);
1461
1462         init_24x7_request(request_buffer, result_buffer);
1463
1464         put_cpu_var(hv_24x7_resb);
1465         put_cpu_var(hv_24x7_reqb);
1466 }
1467
1468 /*
1469  * Clean up transaction state.
1470  *
1471  * NOTE: Ignore state of request and result buffers for now.
1472  *       We will initialize them during the next read/txn.
1473  */
1474 static void reset_txn(void)
1475 {
1476         __this_cpu_write(hv_24x7_txn_flags, 0);
1477         __this_cpu_write(hv_24x7_txn_err, 0);
1478 }
1479
1480 /*
1481  * 24x7 counters only support READ transactions. They are always counting
1482  * and dont need/support ADD transactions. Clear ->txn_flags but otherwise
1483  * ignore transactions that are not of type PERF_PMU_TXN_READ.
1484  *
1485  * For READ transactions, submit all pending 24x7 requests (i.e requests
1486  * that were queued by h_24x7_event_read()), to the hypervisor and update
1487  * the event counts.
1488  */
1489 static int h_24x7_event_commit_txn(struct pmu *pmu)
1490 {
1491         struct hv_24x7_request_buffer *request_buffer;
1492         struct hv_24x7_data_result_buffer *result_buffer;
1493         struct hv_24x7_result *res, *next_res;
1494         u64 count;
1495         int i, ret, txn_flags;
1496         struct hv_24x7_hw *h24x7hw;
1497
1498         txn_flags = __this_cpu_read(hv_24x7_txn_flags);
1499         WARN_ON_ONCE(!txn_flags);
1500
1501         ret = 0;
1502         if (txn_flags & ~PERF_PMU_TXN_READ)
1503                 goto out;
1504
1505         ret = __this_cpu_read(hv_24x7_txn_err);
1506         if (ret)
1507                 goto out;
1508
1509         request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
1510         result_buffer = (void *)get_cpu_var(hv_24x7_resb);
1511
1512         ret = make_24x7_request(request_buffer, result_buffer);
1513         if (ret)
1514                 goto put_reqb;
1515
1516         h24x7hw = &get_cpu_var(hv_24x7_hw);
1517
1518         /* Go through results in the result buffer to update event counts. */
1519         for (i = 0, res = result_buffer->results;
1520              i < result_buffer->num_results; i++, res = next_res) {
1521                 struct perf_event *event = h24x7hw->events[res->result_ix];
1522
1523                 ret = get_count_from_result(event, result_buffer, res, &count,
1524                                             &next_res);
1525                 if (ret)
1526                         break;
1527
1528                 update_event_count(event, count);
1529         }
1530
1531         put_cpu_var(hv_24x7_hw);
1532
1533 put_reqb:
1534         put_cpu_var(hv_24x7_resb);
1535         put_cpu_var(hv_24x7_reqb);
1536 out:
1537         reset_txn();
1538         return ret;
1539 }
1540
1541 /*
1542  * 24x7 counters only support READ transactions. They are always counting
1543  * and dont need/support ADD transactions. However, regardless of type
1544  * of transaction, all we need to do is cleanup, so we don't have to check
1545  * the type of transaction.
1546  */
1547 static void h_24x7_event_cancel_txn(struct pmu *pmu)
1548 {
1549         WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags));
1550         reset_txn();
1551 }
1552
1553 static struct pmu h_24x7_pmu = {
1554         .task_ctx_nr = perf_invalid_context,
1555
1556         .name = "hv_24x7",
1557         .attr_groups = attr_groups,
1558         .event_init  = h_24x7_event_init,
1559         .add         = h_24x7_event_add,
1560         .del         = h_24x7_event_stop,
1561         .start       = h_24x7_event_start,
1562         .stop        = h_24x7_event_stop,
1563         .read        = h_24x7_event_read,
1564         .start_txn   = h_24x7_event_start_txn,
1565         .commit_txn  = h_24x7_event_commit_txn,
1566         .cancel_txn  = h_24x7_event_cancel_txn,
1567         .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
1568 };
1569
1570 static int hv_24x7_init(void)
1571 {
1572         int r;
1573         unsigned long hret;
1574         struct hv_perf_caps caps;
1575
1576         if (!firmware_has_feature(FW_FEATURE_LPAR)) {
1577                 pr_debug("not a virtualized system, not enabling\n");
1578                 return -ENODEV;
1579         } else if (!cur_cpu_spec->oprofile_cpu_type)
1580                 return -ENODEV;
1581
1582         /* POWER8 only supports v1, while POWER9 only supports v2. */
1583         if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8"))
1584                 interface_version = 1;
1585         else {
1586                 interface_version = 2;
1587
1588                 /* SMT8 in POWER9 needs to aggregate result elements. */
1589                 if (threads_per_core == 8)
1590                         aggregate_result_elements = true;
1591         }
1592
1593         hret = hv_perf_caps_get(&caps);
1594         if (hret) {
1595                 pr_debug("could not obtain capabilities, not enabling, rc=%ld\n",
1596                                 hret);
1597                 return -ENODEV;
1598         }
1599
1600         hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL);
1601         if (!hv_page_cache)
1602                 return -ENOMEM;
1603
1604         /* sampling not supported */
1605         h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
1606
1607         r = create_events_from_catalog(&event_group.attrs,
1608                                    &event_desc_group.attrs,
1609                                    &event_long_desc_group.attrs);
1610
1611         if (r)
1612                 return r;
1613
1614         r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
1615         if (r)
1616                 return r;
1617
1618         return 0;
1619 }
1620
1621 device_initcall(hv_24x7_init);