// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtual PTP 1588 clock for use with LM-safe VMclock device.
 *
 * Copyright © 2024 Amazon.com, Inc. or its affiliates.
 */
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

#include <uapi/linux/vmclock-abi.h>

#include <linux/ptp_clock_kernel.h>

#ifdef CONFIG_X86
#include <asm/pvclock.h>
#include <asm/kvmclock.h>
#endif

#ifdef CONFIG_KVM_GUEST
#define SUPPORT_KVMCLOCK
#endif
static DEFINE_IDA(vmclock_ida);

ACPI_MODULE_NAME("vmclock");
struct vmclock_state {
	struct resource res;
	struct vmclock_abi *clk;
	struct miscdevice miscdev;
	struct ptp_clock_info ptp_clock_info;
	struct ptp_clock *ptp_clock;
	enum clocksource_ids cs_id, sys_cs_id;
	int index;
	char *name;
};
#define VMCLOCK_MAX_WAIT ms_to_ktime(100)

/* Require at least the flags field to be present. All else can be optional. */
#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)

#define VMCLOCK_FIELD_PRESENT(_c, _f)					\
	(le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) +	\
				     sizeof((_c)->_f)))
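
/*
 * For example, VMCLOCK_FIELD_PRESENT(clk, tai_offset_sec) is true only if
 * the structure size advertised by the hypervisor covers the whole of the
 * tai_offset_sec field, so optional trailing fields can be probed before
 * they are read.
 */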
/*
 * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64
 * and add the fractional second part of the reference time.
 *
 * The result is a 128-bit value, the top 64 bits of which are seconds, and
 * the low 64 bits are (seconds >> 64).
 */
static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
					uint64_t period, uint8_t shift,
					uint64_t frac_sec)
{
	unsigned __int128 res = (unsigned __int128)delta * period;

	res >>= shift;
	res += frac_sec;
	*res_hi = res >> 64;

	return res;
}
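
/*
 * Example of the arithmetic above, with purely illustrative numbers: for a
 * nominal 1GHz counter, 'period' would be roughly 2^64 / 1e9 (with a zero
 * shift). A delta of 3e9 ticks then multiplies out to about 3 * 2^64, i.e.
 * three whole seconds returned through *res_hi, with (almost) nothing left
 * in the returned fractional low 64 bits.
 */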
static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
{
	if (likely(clk->time_type == VMCLOCK_TIME_UTC))
		return true;

	if (clk->time_type == VMCLOCK_TIME_TAI &&
	    (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
		if (sec)
			*sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
		return true;
	}

	return false;
}
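
/*
 * Note that tai_offset_sec is signed. At the time of writing UTC lags TAI
 * by 37 seconds, so a device reporting VMCLOCK_TIME_TAI would be expected
 * to advertise an offset of -37 for the adjustment above to yield UTC.
 */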
static int vmclock_get_crosststamp(struct vmclock_state *st,
				   struct ptp_system_timestamp *sts,
				   struct system_counterval_t *system_counter,
				   struct timespec64 *tspec)
{
	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
	struct system_time_snapshot systime_snapshot;
	uint64_t cycle, delta, seq, frac_sec;
#ifdef CONFIG_X86
	/*
	 * We'd expect the hypervisor to know this and to report the clock
	 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
	 */
	if (check_tsc_unstable())
		return -EINVAL;
#endif

	while (1) {
		seq = le32_to_cpu(st->clk->seq_count) & ~1ULL;

		/*
		 * This pairs with a write barrier in the hypervisor
		 * which populates this structure.
		 */
		virt_rmb();

		if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
			return -EINVAL;
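
		/*
		 * The payload is read between this point and the re-read of
		 * seq_count below. Masking off the low bit above means an
		 * in-progress update (odd seq_count) can never match the
		 * re-read value, so a torn read is always retried.
		 */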
		/*
		 * When invoked for gettimex64(), fill in the pre/post system
		 * times. The simple case is when system time is based on the
		 * same counter as st->cs_id, in which case all three times
		 * will be derived from the *same* counter value.
		 *
		 * If the system isn't using the same counter, then the value
		 * from ktime_get_snapshot() will still be used as pre_ts, and
		 * ptp_read_system_postts() is called to populate postts after
		 * calling get_cycles().
		 *
		 * The conversion to timespec64 happens further down, outside
		 * the seq_count loop.
		 */
		if (sts) {
			ktime_get_snapshot(&systime_snapshot);
			if (systime_snapshot.cs_id == st->cs_id) {
				cycle = systime_snapshot.cycles;
			} else {
				cycle = get_cycles();
				ptp_read_system_postts(sts);
			}
		} else {
			cycle = get_cycles();
		}
		delta = cycle - le64_to_cpu(st->clk->counter_value);

		frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
						   le64_to_cpu(st->clk->counter_period_frac_sec),
						   st->clk->counter_period_shift,
						   le64_to_cpu(st->clk->time_frac_sec));
		tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
		tspec->tv_sec += le64_to_cpu(st->clk->time_sec);
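
		/*
		 * At this point tspec holds
		 *   time_sec + (time_frac_sec + ((delta * period) >> shift)) / 2^64
		 * with the whole-second carry of the fractional sum already
		 * folded into tv_sec by mul_u64_u64_shr_add_u64().
		 */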
		if (!tai_adjust(st->clk, &tspec->tv_sec))
			return -EINVAL;

		/*
		 * This pairs with a write barrier in the hypervisor
		 * which populates this structure.
		 */
		virt_rmb();
		if (seq == le32_to_cpu(st->clk->seq_count))
			break;

		if (ktime_after(ktime_get(), deadline))
			return -ETIMEDOUT;
	}
	if (system_counter) {
		system_counter->cycles = cycle;
		system_counter->cs_id = st->cs_id;
	}

	if (sts) {
		sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
		if (systime_snapshot.cs_id == st->cs_id)
			sts->post_ts = sts->pre_ts;
	}

	return 0;
}
#ifdef SUPPORT_KVMCLOCK
/*
 * In the case where the system is using the KVM clock for timekeeping, convert
 * the TSC value into a KVM clock time in order to return a paired reading that
 * get_device_system_crosststamp() can cope with.
 */
static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
					    struct ptp_system_timestamp *sts,
					    struct system_counterval_t *system_counter,
					    struct timespec64 *tspec)
{
	struct pvclock_vcpu_time_info *pvti = this_cpu_pvti();
	unsigned int pvti_ver;
	int ret;

	preempt_disable_notrace();

	do {
		pvti_ver = pvclock_read_begin(pvti);

		ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
		if (ret)
			break;
		system_counter->cycles = __pvclock_read_cycles(pvti,
							       system_counter->cycles);
		system_counter->cs_id = CSID_X86_KVM_CLK;
		/*
		 * This retry should never really happen; if the TSC is
		 * stable and reliable enough across vCPUs that it is sane
		 * for the hypervisor to expose a VMCLOCK device which uses
		 * it as the reference counter, then the KVM clock should be
		 * in 'master clock mode' and basically never changed. But
		 * the KVM clock is a fickle and often broken thing, so do
		 * it "properly" just in case.
		 */
	} while (pvclock_read_retry(pvti, pvti_ver));

	preempt_enable_notrace();

	return ret;
}
#endif
static int ptp_vmclock_get_time_fn(ktime_t *device_time,
				   struct system_counterval_t *system_counter,
				   void *ctx)
{
	struct vmclock_state *st = ctx;
	struct timespec64 tspec;
	int ret;

#ifdef SUPPORT_KVMCLOCK
	if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
		ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
						       &tspec);
	else
#endif
		ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec);

	if (!ret)
		*device_time = timespec64_to_ktime(tspec);

	return ret;
}
static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
				      struct system_device_crosststamp *xtstamp)
{
	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
						ptp_clock_info);
	int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
						NULL, xtstamp);
#ifdef SUPPORT_KVMCLOCK
	/*
	 * On x86, the KVM clock may be used for the system time. We can
	 * actually convert a TSC reading to that, and return a paired
	 * timestamp that get_device_system_crosststamp() *can* handle.
	 */
	if (ret == -ENODEV) {
		struct system_time_snapshot systime_snapshot;

		ktime_get_snapshot(&systime_snapshot);

		if (systime_snapshot.cs_id == CSID_X86_TSC ||
		    systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
			WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
			ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
							    st, NULL, xtstamp);
		}
	}
#endif
	return ret;
}
/*
 * PTP clock operations
 */

static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
{
	return -EOPNOTSUPP;
}

static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
{
	return -EOPNOTSUPP;
}

static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
			       const struct timespec64 *ts)
{
	return -EOPNOTSUPP;
}

static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
				struct ptp_system_timestamp *sts)
{
	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
						ptp_clock_info);

	return vmclock_get_crosststamp(st, sts, NULL, ts);
}

static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
			      struct ptp_clock_request *rq, int on)
{
	return -EOPNOTSUPP;
}
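
/*
 * The shared memory structure is written only by the hypervisor, so the
 * guest cannot steer this clock: the adjust/set/enable callbacks above
 * exist only to satisfy the ptp_clock_info interface, and only
 * gettimex64 and getcrosststamp do any real work.
 */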
static const struct ptp_clock_info ptp_vmclock_info = {
	.owner		= THIS_MODULE,
	.adjfine	= ptp_vmclock_adjfine,
	.adjtime	= ptp_vmclock_adjtime,
	.gettimex64	= ptp_vmclock_gettimex,
	.settime64	= ptp_vmclock_settime,
	.enable		= ptp_vmclock_enable,
	.getcrosststamp	= ptp_vmclock_getcrosststamp,
};
static struct ptp_clock *vmclock_ptp_register(struct device *dev,
					      struct vmclock_state *st)
{
	enum clocksource_ids cs_id;

	if (IS_ENABLED(CONFIG_ARM64) &&
	    st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
		/* Can we check it's the virtual counter? */
		cs_id = CSID_ARM_ARCH_COUNTER;
	} else if (IS_ENABLED(CONFIG_X86) &&
		   st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
		cs_id = CSID_X86_TSC;
	} else {
		return NULL;
	}

	/* Only UTC, or TAI with offset */
	if (!tai_adjust(st->clk, NULL)) {
		dev_info(dev, "vmclock does not provide unambiguous UTC\n");
		return NULL;
	}
	st->sys_cs_id = cs_id;
	st->cs_id = cs_id;
	st->ptp_clock_info = ptp_vmclock_info;
	strscpy(st->ptp_clock_info.name, st->name);

	return ptp_clock_register(&st->ptp_clock_info, dev);
}
static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
{
	struct vmclock_state *st = container_of(fp->private_data,
						struct vmclock_state, miscdev);

	if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
		return -EROFS;

	if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
		return -EINVAL;

	if (io_remap_pfn_range(vma, vma->vm_start,
			       st->res.start >> PAGE_SHIFT, PAGE_SIZE,
			       vma->vm_page_prot))
		return -EAGAIN;

	return 0;
}
static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
				    size_t count, loff_t *ppos)
{
	struct vmclock_state *st = container_of(fp->private_data,
						struct vmclock_state, miscdev);
	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
	size_t max_count;
	uint32_t seq;

	if (*ppos >= PAGE_SIZE)
		return 0;

	max_count = PAGE_SIZE - *ppos;
	if (count > max_count)
		count = max_count;

	while (1) {
		seq = le32_to_cpu(st->clk->seq_count) & ~1U;
		/* Pairs with hypervisor wmb */
		virt_rmb();

		if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
			return -EFAULT;

		/* Pairs with hypervisor wmb */
		virt_rmb();
		if (seq == le32_to_cpu(st->clk->seq_count))
			break;

		if (ktime_after(ktime_get(), deadline))
			return -ETIMEDOUT;
	}

	*ppos += count;

	return count;
}
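
/*
 * A userspace consumer is expected to mmap() the structure read-only and
 * follow the same seq_count discipline as the read() implementation above.
 * A rough sketch (not part of this driver, error handling omitted):
 *
 *	struct vmclock_abi *abi = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
 *	do {
 *		seq = le32toh(abi->seq_count) & ~1U;
 *		... read the fields of interest ...
 *	} while (seq != le32toh(abi->seq_count));
 */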
static const struct file_operations vmclock_miscdev_fops = {
	.mmap = vmclock_miscdev_mmap,
	.read = vmclock_miscdev_read,
};

/* module operations */
static void vmclock_remove(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	struct vmclock_state *st = dev_get_drvdata(dev);

	if (st->ptp_clock)
		ptp_clock_unregister(st->ptp_clock);

	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
		misc_deregister(&st->miscdev);
}
static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
{
	struct vmclock_state *st = data;
	struct resource_win win;
	struct resource *res = &win.res;

	if (ares->type == ACPI_RESOURCE_TYPE_END_TAG)
		return AE_OK;

	/* There can be only one */
	if (resource_type(&st->res) == IORESOURCE_MEM)
		return AE_ERROR;

	if (acpi_dev_resource_memory(ares, res) ||
	    acpi_dev_resource_address_space(ares, &win)) {
		if (resource_type(res) != IORESOURCE_MEM ||
		    resource_size(res) < sizeof(st->clk))
			return AE_ERROR;

		st->res = *res;
		return AE_OK;
	}

	return AE_ERROR;
}
static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
{
	struct acpi_device *adev = ACPI_COMPANION(dev);
	acpi_status status;

	/*
	 * This should never happen as this function is only called when
	 * has_acpi_companion(dev) is true, but the logic is sufficiently
	 * complex that Coverity can't see the tautology.
	 */
	if (!adev)
		return -ENODEV;

	status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS,
				     vmclock_acpi_resources, st);
	if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) {
		dev_err(dev, "failed to get resources\n");
		return -ENODEV;
	}

	return 0;
}
static void vmclock_put_idx(void *data)
{
	struct vmclock_state *st = data;

	ida_free(&vmclock_ida, st->index);
}
static int vmclock_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	struct vmclock_state *st;
	int ret;

	st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
	if (!st)
		return -ENOMEM;

	if (has_acpi_companion(dev))
		ret = vmclock_probe_acpi(dev, st);
	else
		ret = -EINVAL; /* Only ACPI for now */

	if (ret) {
		dev_info(dev, "Failed to obtain physical address: %d\n", ret);
		return ret;
	}
	if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) {
		dev_info(dev, "Region too small (0x%llx)\n",
			 resource_size(&st->res));
		return -EINVAL;
	}

	st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res),
				MEMREMAP_WB | MEMREMAP_DEC);
	if (IS_ERR(st->clk)) {
		ret = PTR_ERR(st->clk);
		dev_info(dev, "failed to map shared memory\n");
		return ret;
	}

	if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC ||
	    le32_to_cpu(st->clk->size) > resource_size(&st->res) ||
	    le16_to_cpu(st->clk->version) != 1) {
		dev_info(dev, "vmclock magic fields invalid\n");
		return -EINVAL;
	}
	ret = ida_alloc(&vmclock_ida, GFP_KERNEL);
	if (ret < 0)
		return ret;

	st->index = ret;
	ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st);
	if (ret)
		return ret;

	st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index);
	if (!st->name)
		return -ENOMEM;
	/*
	 * If the structure is big enough, it can be mapped to userspace.
	 * Theoretically a guest OS even using larger pages could still
	 * use 4KiB PTEs to map smaller MMIO regions like this, but let's
	 * cross that bridge if/when we come to it.
	 */
	if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) {
		st->miscdev.minor = MISC_DYNAMIC_MINOR;
		st->miscdev.fops = &vmclock_miscdev_fops;
		st->miscdev.name = st->name;

		ret = misc_register(&st->miscdev);
		if (ret)
			return ret;
	}
	/* If there is valid clock information, register a PTP clock */
	if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) {
		/* Can return a silent NULL, or an error. */
		st->ptp_clock = vmclock_ptp_register(dev, st);
		if (IS_ERR(st->ptp_clock)) {
			ret = PTR_ERR(st->ptp_clock);
			st->ptp_clock = NULL;
			vmclock_remove(pdev);
			return ret;
		}
	}
	if (!st->miscdev.minor && !st->ptp_clock) {
		/* Neither miscdev nor PTP registered */
		dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n");
		return -ENODEV;
	}

	dev_info(dev, "%s: registered %s%s%s\n", st->name,
		 st->miscdev.minor ? "miscdev" : "",
		 (st->miscdev.minor && st->ptp_clock) ? ", " : "",
		 st->ptp_clock ? "PTP" : "");

	dev_set_drvdata(dev, st);

	return 0;
}
static const struct acpi_device_id vmclock_acpi_ids[] = {
	{ "AMZNC10C", 0 },
	{}
};
MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
static struct platform_driver vmclock_platform_driver = {
	.probe		= vmclock_probe,
	.remove_new	= vmclock_remove,
	.driver	= {
		.name			= "vmclock",
		.acpi_match_table	= vmclock_acpi_ids,
	},
};

module_platform_driver(vmclock_platform_driver)
MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
MODULE_DESCRIPTION("PTP clock using VMCLOCK");
MODULE_LICENSE("GPL");