// SPDX-License-Identifier: GPL-2.0
/*
 * KVM dirty page logging performance test
 *
 * Based on dirty_log_test.c
 *
 * Copyright (C) 2018, Red Hat, Inc.
 * Copyright (C) 2020, Google, Inc.
 */
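
/*
 * Example invocation (illustrative only; see help() below for the full
 * option list):
 *
 *   ./dirty_log_perf_test -v 4 -b 512M -i 5 -s anonymous_hugetlb
 *
 * runs five dirty-memory iterations with four vCPUs, each dirtying its own
 * 512M region backed by anonymous hugetlb pages (which must be reserved on
 * the host beforehand).
 */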

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
#include <linux/bitmap.h>

#include "kvm_util.h"
#include "test_util.h"
#include "memstress.h"
#include "guest_modes.h"
#include "ucall_common.h"

#ifdef __aarch64__
#include "aarch64/vgic.h"

static int gic_fd;

static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus)
{
	/*
	 * The test can still run even if hardware does not support GICv3, as it
	 * is only an optimization to reduce guest exits.
	 */
	gic_fd = vgic_v3_setup(vm, nr_vcpus, 64);
}

static void arch_cleanup_vm(struct kvm_vm *vm)
{
	if (gic_fd > 0)
		close(gic_fd);
}

#else /* __aarch64__ */

static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus)
{
}

static void arch_cleanup_vm(struct kvm_vm *vm)
{
}

#endif

/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop) */
#define TEST_HOST_LOOP_N		2UL

static int nr_vcpus = 1;
static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
static bool run_vcpus_while_disabling_dirty_logging;

/* Host variables */
static u64 dirty_log_manual_caps;
static bool host_quit;
static int iteration;
static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
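
/*
 * Iteration protocol (inferred from the code below): the main thread
 * advances "iteration" and then waits for every entry of
 * vcpu_last_completed_iteration[] to catch up; each vCPU worker runs the
 * guest once per iteration, records its progress, and spins until the
 * iteration counter changes. A negative iteration tells the workers to keep
 * dirtying memory while dirty logging is torn down, and host_quit ends the
 * test.
 */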

static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
{
	struct kvm_vcpu *vcpu = vcpu_args->vcpu;
	int vcpu_idx = vcpu_args->vcpu_idx;
	uint64_t pages_count = 0;
	struct kvm_run *run;
	struct timespec start;
	struct timespec ts_diff;
	struct timespec total = (struct timespec){0};
	struct timespec avg;
	int ret;

	run = vcpu->run;

	while (!READ_ONCE(host_quit)) {
		int current_iteration = READ_ONCE(iteration);

		clock_gettime(CLOCK_MONOTONIC, &start);
		ret = _vcpu_run(vcpu);
		ts_diff = timespec_elapsed(start);

		TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret);
		TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
			    "Invalid guest sync status: exit_reason=%s",
			    exit_reason_str(run->exit_reason));

		pr_debug("Got sync event from vCPU %d\n", vcpu_idx);
		vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
		pr_debug("vCPU %d updated last completed iteration to %d\n",
			 vcpu_idx, vcpu_last_completed_iteration[vcpu_idx]);

		if (current_iteration) {
			pages_count += vcpu_args->pages;
			total = timespec_add(total, ts_diff);
			pr_debug("vCPU %d iteration %d dirty memory time: %ld.%.9lds\n",
				 vcpu_idx, current_iteration, ts_diff.tv_sec,
				 ts_diff.tv_nsec);
		} else {
			pr_debug("vCPU %d iteration %d populate memory time: %ld.%.9lds\n",
				 vcpu_idx, current_iteration, ts_diff.tv_sec,
				 ts_diff.tv_nsec);
		}

		/*
		 * Keep running the guest while dirty logging is being disabled
		 * (iteration is negative) so that vCPUs are accessing memory
		 * for the entire duration of zapping collapsible SPTEs.
		 */
		while (current_iteration == READ_ONCE(iteration) &&
		       READ_ONCE(iteration) >= 0 && !READ_ONCE(host_quit)) {}
	}

	avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_idx]);
	pr_debug("\nvCPU %d dirtied 0x%lx pages over %d iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
		 vcpu_idx, pages_count, vcpu_last_completed_iteration[vcpu_idx],
		 total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec);
}
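
/*
 * Per-run test configuration. Each field is populated from a command-line
 * flag in main(); see help() for the flag meanings and defaults.
 */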
struct test_params {
	unsigned long iterations;
	uint64_t phys_offset;
	bool partition_vcpu_memory_access;
	enum vm_mem_backing_src_type backing_src;
	int slots;
	uint32_t write_percent;
	bool random_access;
};

static void run_test(enum vm_guest_mode mode, void *arg)
{
	struct test_params *p = arg;
	struct kvm_vm *vm;
	unsigned long **bitmaps;
	uint64_t guest_num_pages;
	uint64_t host_num_pages;
	uint64_t pages_per_slot;
	struct timespec start;
	struct timespec ts_diff;
	struct timespec get_dirty_log_total = (struct timespec){0};
	struct timespec vcpu_dirty_total = (struct timespec){0};
	struct timespec avg;
	struct timespec clear_dirty_log_total = (struct timespec){0};
	int i;

	vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
				 p->slots, p->backing_src,
				 p->partition_vcpu_memory_access);

	memstress_set_write_percent(vm, p->write_percent);

	guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift;
	guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
	host_num_pages = vm_num_host_pages(mode, guest_num_pages);
	pages_per_slot = host_num_pages / p->slots;
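
	/*
	 * One dirty bitmap is allocated per memslot, each sized to cover that
	 * slot's share of the host pages, so the get/clear calls below can
	 * walk every slot in a single pass.
	 */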
	bitmaps = memstress_alloc_bitmaps(p->slots, pages_per_slot);
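
	/*
	 * With KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 enabled, KVM_GET_DIRTY_LOG
	 * no longer clears the dirty log; userspace must do that explicitly
	 * via KVM_CLEAR_DIRTY_LOG (timed separately below). Passing -g on
	 * the command line zeroes dirty_log_manual_caps to skip this.
	 */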
	if (dirty_log_manual_caps)
		vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
			      dirty_log_manual_caps);

	arch_setup_vm(vm, nr_vcpus);

	/* Start the iterations */
	iteration = 0;
	host_quit = false;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < nr_vcpus; i++)
		vcpu_last_completed_iteration[i] = -1;
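
	/*
	 * Note: the -1 above marks every vCPU as "not yet started"; the
	 * populate wait below only completes once each worker has finished
	 * iteration 0.
	 */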

	/*
	 * Use 100% writes during the population phase to ensure all
	 * memory is actually populated and not just mapped to the zero
	 * page. This prevents expensive copy-on-write faults from
	 * occurring during the dirty memory iterations below, which
	 * would pollute the performance results.
	 */
	memstress_set_write_percent(vm, 100);
	memstress_set_random_access(vm, false);
	memstress_start_vcpu_threads(nr_vcpus, vcpu_worker);

	/* Allow the vCPUs to populate memory */
	pr_debug("Starting iteration %d - Populating\n", iteration);
	for (i = 0; i < nr_vcpus; i++) {
		while (READ_ONCE(vcpu_last_completed_iteration[i]) !=
		       iteration)
			;
	}

	ts_diff = timespec_elapsed(start);
	pr_info("Populate memory time: %ld.%.9lds\n",
		ts_diff.tv_sec, ts_diff.tv_nsec);
	/* Enable dirty logging */
	clock_gettime(CLOCK_MONOTONIC, &start);
	memstress_enable_dirty_logging(vm, p->slots);
	ts_diff = timespec_elapsed(start);
	pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
		ts_diff.tv_sec, ts_diff.tv_nsec);
	memstress_set_write_percent(vm, p->write_percent);
	memstress_set_random_access(vm, p->random_access);

	while (iteration < p->iterations) {
		/*
		 * Incrementing the iteration number will start the vCPUs
		 * dirtying memory again.
		 */
		clock_gettime(CLOCK_MONOTONIC, &start);
		iteration++;

		pr_debug("Starting iteration %d\n", iteration);
		for (i = 0; i < nr_vcpus; i++) {
			while (READ_ONCE(vcpu_last_completed_iteration[i])
			       != iteration)
				;
		}

		ts_diff = timespec_elapsed(start);
		vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff);
		pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
			iteration, ts_diff.tv_sec, ts_diff.tv_nsec);

		clock_gettime(CLOCK_MONOTONIC, &start);
		memstress_get_dirty_log(vm, bitmaps, p->slots);
		ts_diff = timespec_elapsed(start);
		get_dirty_log_total = timespec_add(get_dirty_log_total,
						   ts_diff);
		pr_info("Iteration %d get dirty log time: %ld.%.9lds\n",
			iteration, ts_diff.tv_sec, ts_diff.tv_nsec);

		if (dirty_log_manual_caps) {
			clock_gettime(CLOCK_MONOTONIC, &start);
			memstress_clear_dirty_log(vm, bitmaps, p->slots,
						  pages_per_slot);
			ts_diff = timespec_elapsed(start);
			clear_dirty_log_total = timespec_add(clear_dirty_log_total,
							     ts_diff);
			pr_info("Iteration %d clear dirty log time: %ld.%.9lds\n",
				iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
		}
	}

	/*
	 * Run vCPUs while dirty logging is being disabled to stress disabling
	 * in terms of both performance and correctness. Opt-in via command
	 * line as this significantly increases time to disable dirty logging.
	 */
	if (run_vcpus_while_disabling_dirty_logging)
		WRITE_ONCE(iteration, -1);
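
	/*
	 * A negative iteration makes the workers skip their spin-wait and
	 * keep dirtying memory (see the "iteration is negative" check in
	 * vcpu_worker()) until host_quit is set.
	 */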

	/* Disable dirty logging */
	clock_gettime(CLOCK_MONOTONIC, &start);
	memstress_disable_dirty_logging(vm, p->slots);
	ts_diff = timespec_elapsed(start);
	pr_info("Disabling dirty logging time: %ld.%.9lds\n",
		ts_diff.tv_sec, ts_diff.tv_nsec);

	/*
	 * Tell the vCPU threads to quit. No need to manually check that vCPUs
	 * have stopped running after disabling dirty logging, the join will
	 * wait for them to exit.
	 */
	host_quit = true;
	memstress_join_vcpu_threads(nr_vcpus);

	avg = timespec_div(get_dirty_log_total, p->iterations);
	pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
		p->iterations, get_dirty_log_total.tv_sec,
		get_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);

	if (dirty_log_manual_caps) {
		avg = timespec_div(clear_dirty_log_total, p->iterations);
		pr_info("Clear dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
			p->iterations, clear_dirty_log_total.tv_sec,
			clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);
	}

	memstress_free_bitmaps(bitmaps, p->slots);
	arch_cleanup_vm(vm);
	memstress_destroy_vm(vm);
}

static void help(char *name)
{
	puts("");
	printf("usage: %s [-h] [-a] [-i iterations] [-p offset] [-g] "
	       "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-r random seed] [-s mem type]"
	       " [-x memslots] [-w percentage] [-c physical cpus to run test on]\n", name);
	puts("");
	printf(" -a: access memory randomly rather than in order.\n");
	printf(" -i: specify iteration counts (default: %"PRIu64")\n",
	       TEST_HOST_LOOP_N);
	printf(" -g: Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. This\n"
	       "     makes KVM_GET_DIRTY_LOG clear the dirty log (i.e.\n"
	       "     KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE is not enabled)\n"
	       "     and writes will be tracked as soon as dirty logging is\n"
	       "     enabled on the memslot (i.e. KVM_DIRTY_LOG_INITIALLY_SET\n"
	       "     is not enabled).\n");
	printf(" -p: specify guest physical test memory offset\n"
	       "     Warning: a low offset can conflict with the loaded test code.\n");
	guest_modes_help();
	printf(" -n: Run the vCPUs in nested mode (L2)\n");
	printf(" -e: Run vCPUs while dirty logging is being disabled. This\n"
	       "     can significantly increase runtime, especially if there\n"
	       "     isn't a dedicated pCPU for the main thread.\n");
	printf(" -b: specify the size of the memory region which should be\n"
	       "     dirtied by each vCPU. e.g. 10M or 3G.\n"
	       "     (default: 1G)\n");
	printf(" -v: specify the number of vCPUs to run.\n");
	printf(" -o: Overlap guest memory accesses instead of partitioning\n"
	       "     them into a separate region of memory for each vCPU.\n");
	printf(" -r: specify the starting random seed.\n");
	backing_src_help("-s");
	printf(" -x: Split the memory region into this number of memslots.\n"
	       "     (default: 1)\n");
	printf(" -w: specify the percentage of pages which should be written to\n"
	       "     as an integer from 0-100 inclusive. This is probabilistic,\n"
	       "     so -w X means each page has an X%% chance of writing\n"
	       "     and a (100-X)%% chance of reading.\n"
	       "     (default: 100 i.e. all pages are written to.)\n");
	kvm_print_vcpu_pinning_help();
	puts("");
	exit(0);
}

int main(int argc, char *argv[])
{
	int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
	const char *pcpu_list = NULL;
	struct test_params p = {
		.iterations = TEST_HOST_LOOP_N,
		.partition_vcpu_memory_access = true,
		.backing_src = DEFAULT_VM_MEM_SRC,
		.slots = 1,
		.write_percent = 100,
	};
	int opt;

	/* Override the seed to be deterministic by default. */
	guest_random_seed = 1;

	dirty_log_manual_caps =
		kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
	dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
				  KVM_DIRTY_LOG_INITIALLY_SET);

	guest_modes_append_default();

	while ((opt = getopt(argc, argv, "ab:c:eghi:m:nop:r:s:v:x:w:")) != -1) {
		switch (opt) {
		case 'a':
			p.random_access = true;
			break;
		case 'b':
			guest_percpu_mem_size = parse_size(optarg);
			break;
		case 'c':
			pcpu_list = optarg;
			break;
		case 'e':
			/* 'e' is for evil. */
			run_vcpus_while_disabling_dirty_logging = true;
			break;
		case 'g':
			dirty_log_manual_caps = 0;
			break;
		case 'h':
			help(argv[0]);
			break;
		case 'i':
			p.iterations = atoi_positive("Number of iterations", optarg);
			break;
		case 'm':
			guest_modes_cmdline(optarg);
			break;
		case 'n':
			memstress_args.nested = true;
			break;
		case 'o':
			p.partition_vcpu_memory_access = false;
			break;
		case 'p':
			p.phys_offset = strtoull(optarg, NULL, 0);
			break;
		case 'r':
			guest_random_seed = atoi_positive("Random seed", optarg);
			break;
		case 's':
			p.backing_src = parse_backing_src_type(optarg);
			break;
		case 'v':
			nr_vcpus = atoi_positive("Number of vCPUs", optarg);
			TEST_ASSERT(nr_vcpus <= max_vcpus,
				    "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
			break;
		case 'w':
			p.write_percent = atoi_non_negative("Write percentage", optarg);
			TEST_ASSERT(p.write_percent <= 100,
				    "Write percentage must be between 0 and 100");
			break;
		case 'x':
			p.slots = atoi_positive("Number of slots", optarg);
			break;
		default:
			help(argv[0]);
			break;
		}
	}

	if (pcpu_list) {
		kvm_parse_vcpu_pinning(pcpu_list, memstress_args.vcpu_to_pcpu,
				       nr_vcpus);
		memstress_args.pin_vcpus = true;
	}

	TEST_ASSERT(p.iterations >= 2, "The test should have at least two iterations");

	pr_info("Test iterations: %"PRIu64"\n", p.iterations);

	for_each_guest_mode(run_test, &p);

	return 0;
}