// SPDX-License-Identifier: GPL-2.0-only
/*
 * Include rseq.c without _GNU_SOURCE defined, before including any headers, so
 * that rseq.c is compiled with its configuration, not KVM selftests' config.
 */
#undef _GNU_SOURCE
#include "../rseq/rseq.c"
#define _GNU_SOURCE

#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/sysinfo.h>
#include <asm/barrier.h>
#include <linux/atomic.h>
#include <linux/rseq.h>
#include <linux/unistd.h>

#include "kvm_util.h"
#include "processor.h"
#include "test_util.h"
#include "ucall_common.h"

/*
 * Any bug related to task migration is likely to be timing-dependent; perform
 * a large number of migrations to reduce the odds of a false negative.
 */
#define NR_TASK_MIGRATIONS 100000

static pthread_t migration_thread;
static cpu_set_t possible_mask;
static int min_cpu, max_cpu;
static bool done;

static atomic_t seq_cnt;

static void guest_code(void)
{
	for (;;)
		GUEST_SYNC(0);
}
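
/*
 * Wrapper for the raw getcpu syscall, used by the verification loop in
 * main().  A minimal sketch: it assumes the raw syscall (as opposed to a
 * vDSO or cached value) is what's wanted, so that the scheduler's view of
 * the current CPU is re-queried on every call.
 */
static int sys_getcpu(unsigned int *cpu, unsigned int *node)
{
	return syscall(__NR_getcpu, cpu, node, NULL);
}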

static int next_cpu(int cpu)
{
	/*
	 * Advance to the next CPU, skipping those that weren't in the original
	 * affinity set.  Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
	 * data storage is considered opaque.  Note, if this task is pinned
	 * to a small set of discontiguous CPUs, e.g. 2 and 1023, this loop will
	 * burn a lot of cycles and the test will take longer than normal to
	 * complete.
	 */
	do {
		cpu++;
		if (cpu > max_cpu) {
			cpu = min_cpu;
			TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
				    "Min CPU = %d must always be usable", cpu);
			break;
		}
	} while (!CPU_ISSET(cpu, &possible_mask));

	return cpu;
}

static void *migration_worker(void *__rseq_tid)
{
	pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid;
	cpu_set_t allowed_mask;
	int r, i, cpu;

	CPU_ZERO(&allowed_mask);

	for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
		CPU_SET(cpu, &allowed_mask);

		/*
		 * Bump the sequence count twice to allow the reader to detect
		 * that a migration may have occurred in between rseq and sched
		 * CPU ID reads.  An odd sequence count indicates a migration
		 * is in-progress, while a completely different count indicates
		 * a migration occurred since the count was last read.
		 */
		atomic_inc(&seq_cnt);

		/*
		 * Ensure the odd count is visible while getcpu() isn't
		 * stable, i.e. while changing affinity is in-progress.
		 */
		smp_wmb();

		r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask);
		TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
			    errno, strerror(errno));

		smp_wmb();
		atomic_inc(&seq_cnt);
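
		/*
		 * Note: the two atomic_inc() calls bracket the affinity change
		 * like a seqlock write section.  The reader in main() pairs
		 * its smp_rmb() calls with the smp_wmb() above, and retries
		 * its getcpu()+rseq reads if it sees an odd count or a count
		 * that changed across the reads.
		 */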

		CPU_CLR(cpu, &allowed_mask);

		/*
		 * Wait 1-10us before proceeding to the next iteration and more
		 * specifically, before bumping seq_cnt again.  A delay is
		 * needed on three fronts:
		 *
		 *  1. To allow sched_setaffinity() to prompt migration before
		 *     ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
		 *     (or TIF_NEED_RESCHED, which indirectly leads to handling
		 *     NOTIFY_RESUME) is handled in KVM context.
		 *
		 *     If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
		 *     the guest, the guest will trigger an IO/MMIO exit all the
		 *     way to userspace and the TIF flags will be handled by
		 *     the generic "exit to userspace" logic, not by KVM.  The
		 *     exit to userspace is necessary to give the test a chance
		 *     to check the rseq CPU ID (see #2).
		 *
		 *     Alternatively, guest_code() could include an instruction
		 *     to trigger an exit that is handled by KVM, but any such
		 *     exit requires architecture specific code.
		 *
		 *  2. To let ioctl(KVM_RUN) make its way back to the test
		 *     before the next round of migration.  The test's check on
		 *     the rseq CPU ID must wait for migration to complete in
		 *     order to avoid a false positive, thus any kernel rseq bug
		 *     will be missed if the next migration starts before the
		 *     check completes.
		 *
		 *  3. To ensure the read-side makes efficient forward progress,
		 *     e.g. if getcpu() involves a syscall.  Stalling the read-side
		 *     means the test will spend more time waiting for getcpu()
		 *     to stabilize and less time trying to hit the timing-dependent
		 *     bug.
		 *
		 * Because any bug in this area is likely to be timing-dependent,
		 * run with a range of delays at 1us intervals from 1us to 10us
		 * as a best effort to avoid tuning the test to the point where
		 * it can hit _only_ the original bug and not detect future
		 * regressions.
		 *
		 * The original bug can reproduce with a delay up to ~500us on
		 * x86-64, but starts to require more iterations to reproduce
		 * as the delay creeps above ~10us, and the average runtime of
		 * each iteration obviously increases as well.  Cap the delay
		 * at 10us to keep test runtime reasonable while minimizing
		 * potential coverage loss.
		 *
		 * The lower bound for reproducing the bug is likely below 1us,
		 * e.g. failures occur on x86-64 with nanosleep(0), but at that
		 * point the overhead of the syscall likely dominates the delay.
		 * Use usleep() for simplicity and to avoid unnecessary kernel
		 * overhead.
		 */
		usleep((i % 10) + 1);
	}
	done = true;
	return NULL;
}

static void calc_min_max_cpu(void)
{
	int i, cnt = 0, nproc;

	TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2);

	/*
	 * CPU_SET doesn't provide a FOR_EACH helper, get the min/max CPU that
	 * this task is affined to in order to reduce the time spent querying
	 * unusable CPUs, e.g. if this task is pinned to a small percentage of
	 * total CPUs.
	 */
	nproc = get_nprocs_conf();
	min_cpu = -1;
	max_cpu = -1;

	for (i = 0; i < nproc; i++) {
		if (!CPU_ISSET(i, &possible_mask))
			continue;
		if (min_cpu == -1)
			min_cpu = i;
		max_cpu = i;
		cnt++;
	}

	__TEST_REQUIRE(cnt >= 2,
		       "Only one usable CPU, task migration not possible");
}

static void help(const char *name)
{
	puts("");
	printf("usage: %s [-h] [-u]\n", name);
	printf(" -u: Don't sanity check the number of successful KVM_RUNs\n");
	puts("");
	exit(0);
}

int main(int argc, char *argv[])
{
	bool skip_sanity_check = false;
	int r, i, snapshot;
	struct kvm_vm *vm;
	struct kvm_vcpu *vcpu;
	unsigned int cpu, rseq_cpu;
	int opt;

	while ((opt = getopt(argc, argv, "hu")) != -1) {
		switch (opt) {
		case 'u':
			skip_sanity_check = true;
			break;
		case 'h':
		default:
			help(argv[0]);
			break;
		}
	}

	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
		    strerror(errno));

	calc_min_max_cpu();

	r = rseq_register_current_thread();
	TEST_ASSERT(!r, "rseq_register_current_thread failed, errno = %d (%s)",
		    errno, strerror(errno));

	/*
	 * Create and run a dummy VM that immediately exits to userspace via
	 * GUEST_SYNC, while concurrently migrating the process by setting its
	 * affinity to a new CPU.
	 */
	vm = vm_create_with_one_vcpu(&vcpu, guest_code);

	pthread_create(&migration_thread, NULL, migration_worker,
		       (void *)(unsigned long)syscall(SYS_gettid));

	for (i = 0; !done; i++) {
		vcpu_run(vcpu);
		TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
			    "Guest failed?");

		/*
		 * Verify rseq's CPU matches sched's CPU.  Ensure migration
		 * doesn't occur between getcpu() and reading the rseq cpu_id
		 * by rereading both if the sequence count changes, or if the
		 * count is odd (migration in-progress).
		 */
		do {
			/*
			 * Drop bit 0 to force a mismatch if the count is odd,
			 * i.e. if a migration is in-progress.
			 */
			snapshot = atomic_read(&seq_cnt) & ~1;

			/*
			 * Ensure calling getcpu() and reading rseq.cpu_id complete
			 * in a single "no migration" window, i.e. are not reordered
			 * across the seq_cnt reads.
			 */
			smp_rmb();
			r = sys_getcpu(&cpu, NULL);
			TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)",
				    errno, strerror(errno));
			rseq_cpu = rseq_current_cpu_raw();
			smp_rmb();
		} while (snapshot != atomic_read(&seq_cnt));

		TEST_ASSERT(rseq_cpu == cpu,
			    "rseq CPU = %d, sched CPU = %d", rseq_cpu, cpu);
	}

	/*
	 * Sanity check that the test was able to enter the guest a reasonable
	 * number of times, e.g. didn't get stalled too often/long waiting for
	 * getcpu() to stabilize.  A 2:1 migration:KVM_RUN ratio is a fairly
	 * conservative ratio on x86-64, which can do _more_ KVM_RUNs than
	 * migrations given the 1us+ delay in the migration task.
	 *
	 * Another reason the migration:KVM_RUN ratio may be small is that, on
	 * systems with large low power mode wakeup latency, the scheduler may
	 * quite often fail to wake up the target CPU before the vCPU thread
	 * is scheduled to another CPU.
	 */
	TEST_ASSERT(skip_sanity_check || i > (NR_TASK_MIGRATIONS / 2),
		    "Only performed %d KVM_RUNs, task stalled too much?\n\n"
		    "  Try disabling deep sleep states to reduce CPU wakeup latency,\n"
		    "  e.g. via cpuidle.off=1 or setting /dev/cpu_dma_latency to '0',\n"
		    "  or run with -u to disable this sanity check.", i);

	pthread_join(migration_thread, NULL);

	kvm_vm_free(vm);

	rseq_unregister_current_thread();

	return 0;
}
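
/*
 * Example invocation (illustrative paths; assumes an in-tree selftests
 * build):
 *
 *   make -C tools/testing/selftests TARGETS=kvm
 *   ./tools/testing/selftests/kvm/rseq_test
 *
 * On systems with high CPU wakeup latency, the KVM_RUN sanity check can be
 * skipped:
 *
 *   ./tools/testing/selftests/kvm/rseq_test -u
 */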