/*
 * Copyright (C) 2011-2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Parts came from evlist.c builtin-{top,stat,record}.c, see those files for further
 * copyright notes.
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <sys/mman.h>
#include <inttypes.h>
#include <asm/bug.h>
#ifdef HAVE_LIBNUMA_SUPPORT
#include <numaif.h>
#endif
#include "debug.h"
#include "event.h"
#include "mmap.h"
#include "util.h" /* page_size */

size_t perf_mmap__mmap_len(struct perf_mmap *map)
{
	return map->mask + 1 + page_size;
}
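
/*
 * Fetch the next event between *startp and end. An event that wraps past
 * the end of the ring buffer is reassembled into map->event_copy so the
 * caller always sees a contiguous record.
 */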
/* 'end' must point to a good entry */
static union perf_event *perf_mmap__read(struct perf_mmap *map,
					 u64 *startp, u64 end)
{
	unsigned char *data = map->base + page_size;
	union perf_event *event = NULL;
	int diff = end - *startp;

	if (diff >= (int)sizeof(event->header)) {
		size_t size;

		event = (union perf_event *)&data[*startp & map->mask];
		size = event->header.size;

		if (size < sizeof(event->header) || diff < (int)size)
			return NULL;

		/*
		 * Event straddles the mmap boundary -- header should always
		 * be inside due to u64 alignment of output.
		 */
		if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) {
			unsigned int offset = *startp;
			unsigned int len = min(sizeof(*event), size), cpy;
			void *dst = map->event_copy;

			do {
				cpy = min(map->mask + 1 - (offset & map->mask), len);
				memcpy(dst, &data[offset & map->mask], cpy);
				offset += cpy;
				dst += cpy;
				len -= cpy;
			} while (len);

			event = (union perf_event *)map->event_copy;
		}

		*startp += size;
	}

	return event;
}

/*
 * Read event from ring buffer one by one.
 * Return one event for each call.
 *
 * Usage:
 * perf_mmap__read_init()
 * while(event = perf_mmap__read_event()) {
 *	//process the event
 *	perf_mmap__consume()
 * }
 * perf_mmap__read_done()
 */
union perf_event *perf_mmap__read_event(struct perf_mmap *map)
{
	union perf_event *event;

	/*
	 * Check if event was unmapped due to a POLLHUP/POLLERR.
	 */
	if (!refcount_read(&map->refcnt))
		return NULL;

	/* non-overwrite doesn't pause the ringbuffer */
	if (!map->overwrite)
		map->end = perf_mmap__read_head(map);

	event = perf_mmap__read(map, &map->start, map->end);

	if (!map->overwrite)
		map->prev = map->start;

	return event;
}

static bool perf_mmap__empty(struct perf_mmap *map)
{
	return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base;
}
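
/*
 * Each map is created with a refcount of 2 (see perf_mmap__mmap()): one
 * reference for the mmaping tool and one that is only dropped by
 * perf_mmap__consume() once the buffer has been fully drained.
 */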
void perf_mmap__get(struct perf_mmap *map)
{
	refcount_inc(&map->refcnt);
}

void perf_mmap__put(struct perf_mmap *map)
{
	BUG_ON(map->base && refcount_read(&map->refcnt) == 0);

	if (refcount_dec_and_test(&map->refcnt))
		perf_mmap__munmap(map);
}
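
/*
 * Advance the tail pointer so the kernel may reuse the consumed part of the
 * ring buffer; overwrite (backward) maps have no tail to update. Also drops
 * the last reference when only the ring buffer's own reference remains and
 * it is empty.
 */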
void perf_mmap__consume(struct perf_mmap *map)
{
	if (!map->overwrite) {
		u64 old = map->prev;

		perf_mmap__write_tail(map, old);
	}

	if (refcount_read(&map->refcnt) == 1 && perf_mmap__empty(map))
		perf_mmap__put(map);
}
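
/*
 * Default no-op implementations of the AUX area (auxtrace) mmap hooks,
 * declared __weak so the real implementations override them when AUX area
 * tracing support is built in.
 */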
int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused,
			       struct auxtrace_mmap_params *mp __maybe_unused,
			       void *userpg __maybe_unused,
			       int fd __maybe_unused)
{
	return 0;
}

void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused)
{
}

void __weak auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp __maybe_unused,
				       off_t auxtrace_offset __maybe_unused,
				       unsigned int auxtrace_pages __maybe_unused,
				       bool auxtrace_overwrite __maybe_unused)
{
}

void __weak auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp __maybe_unused,
					  struct perf_evlist *evlist __maybe_unused,
					  int idx __maybe_unused,
					  bool per_cpu __maybe_unused)
{
}

#ifdef HAVE_AIO_SUPPORT
static int perf_mmap__aio_enabled(struct perf_mmap *map)
{
	return map->aio.nr_cblocks > 0;
}
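
/*
 * With libnuma support, AIO data buffers are mmap()ed anonymously and
 * mbind()ed to the NUMA node of the CPU owning the ring buffer, keeping
 * asynchronous trace writes node-local.
 */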
#ifdef HAVE_LIBNUMA_SUPPORT
static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
{
	map->aio.data[idx] = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
				  MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (map->aio.data[idx] == MAP_FAILED) {
		map->aio.data[idx] = NULL;
		return -1;
	}

	return 0;
}

static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
{
	if (map->aio.data[idx]) {
		munmap(map->aio.data[idx], perf_mmap__mmap_len(map));
		map->aio.data[idx] = NULL;
	}
}

static int perf_mmap__aio_bind(struct perf_mmap *map, int idx, int cpu, int affinity)
{
	void *data;
	size_t mmap_len;
	unsigned long node_mask;

	if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) {
		data = map->aio.data[idx];
		mmap_len = perf_mmap__mmap_len(map);
		node_mask = 1UL << cpu__get_node(cpu);
		if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) {
			pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n",
			       data, data + mmap_len, cpu__get_node(cpu));
			return -1;
		}
	}

	return 0;
}
#else /* !HAVE_LIBNUMA_SUPPORT */
static int perf_mmap__aio_alloc(struct perf_mmap *map, int idx)
{
	map->aio.data[idx] = malloc(perf_mmap__mmap_len(map));
	if (map->aio.data[idx] == NULL)
		return -1;

	return 0;
}

static void perf_mmap__aio_free(struct perf_mmap *map, int idx)
{
	zfree(&(map->aio.data[idx]));
}

static int perf_mmap__aio_bind(struct perf_mmap *map __maybe_unused, int idx __maybe_unused,
		int cpu __maybe_unused, int affinity __maybe_unused)
{
	return 0;
}
#endif
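
/*
 * Allocate one aiocb pointer, one control block and one data buffer per
 * requested cblock (mp->nr_cblocks). A cblock whose aio_fildes is not -1
 * has an aio write in flight and must be synced before reuse.
 */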
static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp)
{
	int delta_max, i, prio, ret;

	map->aio.nr_cblocks = mp->nr_cblocks;
	if (map->aio.nr_cblocks) {
		map->aio.aiocb = calloc(map->aio.nr_cblocks, sizeof(struct aiocb *));
		if (!map->aio.aiocb) {
			pr_debug2("failed to allocate aiocb for data buffer, error %m\n");
			return -1;
		}
		map->aio.cblocks = calloc(map->aio.nr_cblocks, sizeof(struct aiocb));
		if (!map->aio.cblocks) {
			pr_debug2("failed to allocate cblocks for data buffer, error %m\n");
			return -1;
		}
		map->aio.data = calloc(map->aio.nr_cblocks, sizeof(void *));
		if (!map->aio.data) {
			pr_debug2("failed to allocate data buffer, error %m\n");
			return -1;
		}
		delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX);
		for (i = 0; i < map->aio.nr_cblocks; ++i) {
			ret = perf_mmap__aio_alloc(map, i);
			if (ret == -1) {
				pr_debug2("failed to allocate data buffer area, error %m");
				return -1;
			}
			ret = perf_mmap__aio_bind(map, i, map->cpu, mp->affinity);
			if (ret == -1)
				return -1;
			/*
			 * Use a cblock.aio_fildes value different from -1
			 * to denote a started aio write operation on the
			 * cblock, so it requires an explicit record__aio_sync()
			 * call before the cblock may be reused again.
			 */
			map->aio.cblocks[i].aio_fildes = -1;
			/*
			 * Allocate cblocks with a priority delta to get
			 * faster aio write system calls, because queued requests
			 * are kept in separate per-prio queues and adding
			 * a new request iterates through a shorter per-prio
			 * list. Blocks with numbers higher than
			 * _SC_AIO_PRIO_DELTA_MAX go with priority 0.
			 */
			prio = delta_max - i;
			map->aio.cblocks[i].aio_reqprio = prio >= 0 ? prio : 0;
		}
	}

	return 0;
}

static void perf_mmap__aio_munmap(struct perf_mmap *map)
{
	int i;

	for (i = 0; i < map->aio.nr_cblocks; ++i)
		perf_mmap__aio_free(map, i);
	if (map->aio.data)
		zfree(&map->aio.data);
	zfree(&map->aio.cblocks);
	zfree(&map->aio.aiocb);
}
#else /* !HAVE_AIO_SUPPORT */
static int perf_mmap__aio_enabled(struct perf_mmap *map __maybe_unused)
{
	return 0;
}

static int perf_mmap__aio_mmap(struct perf_mmap *map __maybe_unused,
			       struct mmap_params *mp __maybe_unused)
{
	return 0;
}

static void perf_mmap__aio_munmap(struct perf_mmap *map __maybe_unused)
{
}
#endif

void perf_mmap__munmap(struct perf_mmap *map)
{
	perf_mmap__aio_munmap(map);
	if (map->data != NULL) {
		munmap(map->data, perf_mmap__mmap_len(map));
		map->data = NULL;
	}
	if (map->base != NULL) {
		munmap(map->base, perf_mmap__mmap_len(map));
		map->base = NULL;
		map->fd = -1;
		refcount_set(&map->refcnt, 0);
	}
	auxtrace_mmap__munmap(&map->auxtrace_mmap);
}
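
/*
 * Collect every online CPU that belongs to the given NUMA node into 'mask'.
 */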
static void build_node_mask(int node, cpu_set_t *mask)
{
	int c, cpu, nr_cpus;
	const struct cpu_map *cpu_map = NULL;

	cpu_map = cpu_map__online();
	if (!cpu_map)
		return;

	nr_cpus = cpu_map__nr(cpu_map);
	for (c = 0; c < nr_cpus; c++) {
		cpu = cpu_map->map[c]; /* map c index to online cpu index */
		if (cpu__get_node(cpu) == node)
			CPU_SET(cpu, mask);
	}
}
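
/*
 * Decide which CPUs the reading tool should be allowed to run on for this
 * map: all CPUs of the map's NUMA node (PERF_AFFINITY_NODE), only the map's
 * CPU (PERF_AFFINITY_CPU), or leave the mask empty for the default
 * system-wide affinity.
 */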
static void perf_mmap__setup_affinity_mask(struct perf_mmap *map, struct mmap_params *mp)
{
	CPU_ZERO(&map->affinity_mask);
	if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1)
		build_node_mask(cpu__get_node(map->cpu), &map->affinity_mask);
	else if (mp->affinity == PERF_AFFINITY_CPU)
		CPU_SET(map->cpu, &map->affinity_mask);
}

int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd, int cpu)
{
	/*
	 * The last refcount drop is done at perf_mmap__consume(), so that we
	 * make sure we don't prevent tools from consuming every last event in
	 * the ring buffer.
	 *
	 * I.e. we can get the POLLHUP meaning that the fd doesn't exist
	 * anymore, but the last events for it are still in the ring buffer,
	 * waiting to be consumed.
	 *
	 * Tools can choose to ignore this at their own discretion, but the
	 * evlist layer can't just drop it when filtering events in
	 * perf_evlist__filter_pollfd().
	 */
	refcount_set(&map->refcnt, 2);
	map->prev = 0;
	map->mask = mp->mask;
	map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
			 MAP_SHARED, fd, 0);
	if (map->base == MAP_FAILED) {
		pr_debug2("failed to mmap perf event ring buffer, error %d\n",
			  errno);
		map->base = NULL;
		return -1;
	}
	map->fd = fd;
	map->cpu = cpu;

	perf_mmap__setup_affinity_mask(map, mp);

	map->flush = mp->flush;

	map->comp_level = mp->comp_level;

	if (map->comp_level && !perf_mmap__aio_enabled(map)) {
		map->data = mmap(NULL, perf_mmap__mmap_len(map), PROT_READ|PROT_WRITE,
				 MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
		if (map->data == MAP_FAILED) {
			pr_debug2("failed to mmap data buffer, error %d\n",
				  errno);
			map->data = NULL;
			return -1;
		}
	}

	if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
				&mp->auxtrace_mp, map->base, fd))
		return -1;

	return perf_mmap__aio_mmap(map, mp);
}
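
/*
 * A full overwrite (backward) ring buffer has no valid tail, so walk forward
 * from 'start' header by header until the walk wraps past one full buffer or
 * hits a zero-sized header, and report that position as the readable 'end'.
 */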
static int overwrite_rb_find_range(void *buf, int mask, u64 *start, u64 *end)
{
	struct perf_event_header *pheader;
	u64 evt_head = *start;
	int size = mask + 1;

	pr_debug2("%s: buf=%p, start=%"PRIx64"\n", __func__, buf, *start);
	pheader = (struct perf_event_header *)(buf + (*start & mask));

	while (true) {
		if (evt_head - *start >= (unsigned int)size) {
			pr_debug("Finished reading overwrite ring buffer: rewind\n");
			if (evt_head - *start > (unsigned int)size)
				evt_head -= pheader->size;
			*end = evt_head;
			return 0;
		}

		pheader = (struct perf_event_header *)(buf + (evt_head & mask));

		if (pheader->size == 0) {
			pr_debug("Finished reading overwrite ring buffer: get start\n");
			*end = evt_head;
			return 0;
		}

		evt_head += pheader->size;
		pr_debug3("move evt_head: %"PRIx64"\n", evt_head);
	}

	WARN_ONCE(1, "Shouldn't get here\n");
	return -1;
}

/*
 * Report the start and end of the available data in the ring buffer
 */
static int __perf_mmap__read_init(struct perf_mmap *md)
{
	u64 head = perf_mmap__read_head(md);
	u64 old = md->prev;
	unsigned char *data = md->base + page_size;
	unsigned long size;

	md->start = md->overwrite ? head : old;
	md->end = md->overwrite ? old : head;

	if ((md->end - md->start) < md->flush)
		return -EAGAIN;

	size = md->end - md->start;
	if (size > (unsigned long)(md->mask) + 1) {
		if (!md->overwrite) {
			WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");

			md->prev = head;
			perf_mmap__consume(md);
			return -EAGAIN;
		}

		/*
		 * Backward ring buffer is full. We still have a chance to read
		 * most of the data from it.
		 */
		if (overwrite_rb_find_range(data, md->mask, &md->start, &md->end))
			return -EINVAL;
	}

	return 0;
}

int perf_mmap__read_init(struct perf_mmap *map)
{
	/*
	 * Check if event was unmapped due to a POLLHUP/POLLERR.
	 */
	if (!refcount_read(&map->refcnt))
		return -ENOENT;

	return __perf_mmap__read_init(map);
}
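
/*
 * Push everything between md->start and md->end to the 'push' callback.
 * When the region wraps around the end of the ring buffer it is pushed as
 * two chunks so the callback always receives a contiguous range.
 */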
int perf_mmap__push(struct perf_mmap *md, void *to,
		    int push(struct perf_mmap *map, void *to, void *buf, size_t size))
{
	u64 head = perf_mmap__read_head(md);
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	rc = perf_mmap__read_init(md);
	if (rc < 0)
		return (rc == -EAGAIN) ? 1 : -1;

	size = md->end - md->start;

	if ((md->start & md->mask) + size != (md->end & md->mask)) {
		buf = &data[md->start & md->mask];
		size = md->mask + 1 - (md->start & md->mask);
		md->start += size;

		if (push(md, to, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	buf = &data[md->start & md->mask];
	size = md->end - md->start;
	md->start += size;

	if (push(md, to, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = head;
	perf_mmap__consume(md);
out:
	return rc;
}

/*
 * Mandatory for overwrite mode.
 * The direction of overwrite mode is backward.
 * The last perf_mmap__read() will set the tail to map->prev.
 * map->prev needs to be corrected to head, which is the end of the next read.
 */
void perf_mmap__read_done(struct perf_mmap *map)
{
	/*
	 * Check if event was unmapped due to a POLLHUP/POLLERR.
	 */
	if (!refcount_read(&map->refcnt))
		return;

	map->prev = perf_mmap__read_head(map);
}