1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Virtual PTP 1588 clock for use with LM-safe VMclock device.
5 * Copyright © 2024 Amazon.com, Inc. or its affiliates.
8 #include <linux/acpi.h>
9 #include <linux/device.h>
10 #include <linux/err.h>
11 #include <linux/file.h>
13 #include <linux/init.h>
14 #include <linux/kernel.h>
15 #include <linux/miscdevice.h>
17 #include <linux/module.h>
18 #include <linux/platform_device.h>
19 #include <linux/slab.h>
21 #include <uapi/linux/vmclock-abi.h>
23 #include <linux/ptp_clock_kernel.h>
26 #include <asm/pvclock.h>
27 #include <asm/kvmclock.h>
30 #ifdef CONFIG_KVM_GUEST
31 #define SUPPORT_KVMCLOCK
/*
 * Driver-wide ID allocator. vmclock_probe() takes an id from vmclock_ida
 * with ida_alloc() and vmclock_put_idx() gives it back with ida_free().
 * ACPI_MODULE_NAME() is invoked with the driver name string below.
 */
34 static DEFINE_IDA(vmclock_ida
);
36 ACPI_MODULE_NAME("vmclock");
/*
 * Per-device driver state.
 *
 * clk            pointer to the shared vmclock_abi structure; vmclock_probe()
 *                sets it from devm_memremap().
 * miscdev        misc character device; its fops are vmclock_miscdev_fops.
 * ptp_clock_info per-device copy of ptp_vmclock_info, filled in by
 *                vmclock_ptp_register().
 * ptp_clock      handle returned through vmclock_ptp_register(), or NULL.
 * cs_id          clocksource id compared against the system snapshot in
 *                vmclock_get_crosststamp().
 * sys_cs_id      clocksource id read with READ_ONCE() and updated with
 *                WRITE_ONCE() by the cross-timestamp callbacks.
 *
 * NOTE(review): other code in this file also uses st->res, st->index and
 * st->name, but those member declarations and the closing brace are not
 * visible in this view of the struct.
 */
38 struct vmclock_state
{
40 struct vmclock_abi
*clk
;
41 struct miscdevice miscdev
;
42 struct ptp_clock_info ptp_clock_info
;
43 struct ptp_clock
*ptp_clock
;
44 enum clocksource_ids cs_id
, sys_cs_id
;
/*
 * VMCLOCK_MAX_WAIT      100 milliseconds as a ktime_t; added to ktime_get()
 *                       to form the deadline in the seq_count read paths.
 * VMCLOCK_MIN_SIZE      offset of the pad member of struct vmclock_abi;
 *                       vmclock_probe() compares the region size with it.
 * VMCLOCK_FIELD_PRESENT compares the little-endian size field of _c with
 *                       offsetof(struct vmclock_abi, _f) plus a term on a
 *                       continuation line that is not visible in this view.
 */
49 #define VMCLOCK_MAX_WAIT ms_to_ktime(100)
51 /* Require at least the flags field to be present. All else can be optional. */
52 #define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)
54 #define VMCLOCK_FIELD_PRESENT(_c, _f) \
55 (le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) + \
59 * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64
60 * and add the fractional second part of the reference time.
62 * The result is a 128-bit value, the top 64 bits of which are seconds, and
63 * the low 64 bits are (seconds >> 64).
/*
 * Visible part: takes an output pointer res_hi plus delta, period and shift,
 * and forms the full 128-bit product delta * period in res.
 * NOTE(review): the final parameter and the statements that apply shift,
 * add the fractional part and split the result into res_hi and the return
 * value are not visible in this view; the description above states the
 * intended result.
 */
65 static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi
, uint64_t delta
,
66 uint64_t period
, uint8_t shift
,
69 unsigned __int128 res
= (unsigned __int128
)delta
* period
;
/*
 * Check the time type advertised by clk and apply the TAI offset when
 * needed.
 *
 * clk: shared vmclock structure; time_type, flags and tai_offset_sec are read.
 * sec: seconds value to adjust in place.
 *
 * - time_type VMCLOCK_TIME_UTC is the expected (likely) case.
 * - time_type VMCLOCK_TIME_TAI with VMCLOCK_FLAG_TAI_OFFSET_VALID set in
 *   flags adds tai_offset_sec, taken as a signed 16-bit little-endian
 *   value, to *sec.
 *
 * Callers treat a false return as failure. The return statements are not
 * visible in this view.
 * NOTE(review): vmclock_ptp_register() passes sec == NULL, so the write to
 * *sec must be guarded; that guard is not visible here - confirm.
 */
77 static bool tai_adjust(struct vmclock_abi
*clk
, uint64_t *sec
)
79 if (likely(clk
->time_type
== VMCLOCK_TIME_UTC
))
82 if (clk
->time_type
== VMCLOCK_TIME_TAI
&&
83 (le64_to_cpu(clk
->flags
) & VMCLOCK_FLAG_TAI_OFFSET_VALID
)) {
85 *sec
+= (int16_t)le16_to_cpu(clk
->tai_offset_sec
);
/*
 * Read the vmclock and produce a device time, optionally paired with the
 * counter value and system timestamps it was taken at.
 *
 * st:             driver state; st->clk is the shared structure that is read.
 * sts:            pre/post system timestamps; the gettimex64 handler passes
 *                 its own sts through, the cross-timestamp callback passes
 *                 NULL.
 * system_counter: optional; when non-NULL receives the cycle count and
 *                 st->cs_id.
 * tspec:          receives the computed device time.
 *
 * Visible steps:
 *  - a deadline of now + VMCLOCK_MAX_WAIT is computed and later checked
 *    with ktime_after();
 *  - seq_count is sampled with the low bit masked off, the fields are read,
 *    and seq_count is compared again afterwards, so a sample taken while
 *    seq_count was odd or changing does not compare equal;
 *  - a clock_status of VMCLOCK_STATUS_UNRELIABLE is checked for;
 *  - the cycle count comes from ktime_get_snapshot() when the system
 *    clocksource id equals st->cs_id, otherwise from get_cycles();
 *  - delta = cycle - counter_value is scaled by counter_period_frac_sec and
 *    counter_period_shift with time_frac_sec added, the fractional result is
 *    converted to nanoseconds, and time_sec is added to tv_sec;
 *  - tai_adjust() is applied to tv_sec;
 *  - pre_ts is taken from the snapshot, and post_ts is set equal to pre_ts
 *    when the snapshot clocksource id equals st->cs_id.
 *
 * NOTE(review): the loop construct, the read barriers, the tests on sts and
 * every return statement are not visible in this view, so error codes are
 * not documented here.
 */
91 static int vmclock_get_crosststamp(struct vmclock_state
*st
,
92 struct ptp_system_timestamp
*sts
,
93 struct system_counterval_t
*system_counter
,
94 struct timespec64
*tspec
)
96 ktime_t deadline
= ktime_add(ktime_get(), VMCLOCK_MAX_WAIT
);
97 struct system_time_snapshot systime_snapshot
;
98 uint64_t cycle
, delta
, seq
, frac_sec
;
102 * We'd expect the hypervisor to know this and to report the clock
103 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
105 if (check_tsc_unstable())
110 seq
= le32_to_cpu(st
->clk
->seq_count
) & ~1ULL;
113 * This pairs with a write barrier in the hypervisor
114 * which populates this structure.
118 if (st
->clk
->clock_status
== VMCLOCK_STATUS_UNRELIABLE
)
122 * When invoked for gettimex64(), fill in the pre/post system
123 * times. The simple case is when system time is based on the
124 * same counter as st->cs_id, in which case all three times
125 * will be derived from the *same* counter value.
127 * If the system isn't using the same counter, then the value
128 * from ktime_get_snapshot() will still be used as pre_ts, and
129 * ptp_read_system_postts() is called to populate postts after
130 * calling get_cycles().
132 * The conversion to timespec64 happens further down, outside
133 * the seq_count loop.
136 ktime_get_snapshot(&systime_snapshot
);
137 if (systime_snapshot
.cs_id
== st
->cs_id
) {
138 cycle
= systime_snapshot
.cycles
;
140 cycle
= get_cycles();
141 ptp_read_system_postts(sts
);
144 cycle
= get_cycles();
147 delta
= cycle
- le64_to_cpu(st
->clk
->counter_value
);
149 frac_sec
= mul_u64_u64_shr_add_u64(&tspec
->tv_sec
, delta
,
150 le64_to_cpu(st
->clk
->counter_period_frac_sec
),
151 st
->clk
->counter_period_shift
,
152 le64_to_cpu(st
->clk
->time_frac_sec
));
153 tspec
->tv_nsec
= mul_u64_u64_shr(frac_sec
, NSEC_PER_SEC
, 64);
154 tspec
->tv_sec
+= le64_to_cpu(st
->clk
->time_sec
);
156 if (!tai_adjust(st
->clk
, &tspec
->tv_sec
))
160 * This pairs with a write barrier in the hypervisor
161 * which populates this structure.
164 if (seq
== le32_to_cpu(st
->clk
->seq_count
))
167 if (ktime_after(ktime_get(), deadline
))
171 if (system_counter
) {
172 system_counter
->cycles
= cycle
;
173 system_counter
->cs_id
= st
->cs_id
;
177 sts
->pre_ts
= ktime_to_timespec64(systime_snapshot
.real
);
178 if (systime_snapshot
.cs_id
== st
->cs_id
)
179 sts
->post_ts
= sts
->pre_ts
;
185 #ifdef SUPPORT_KVMCLOCK
187 * In the case where the system is using the KVM clock for timekeeping, convert
188 * the TSC value into a KVM clock time in order to return a paired reading that
189 * get_device_system_crosststamp() can cope with.
191 static int vmclock_get_crosststamp_kvmclock(struct vmclock_state
*st
,
192 struct ptp_system_timestamp
*sts
,
193 struct system_counterval_t
*system_counter
,
194 struct timespec64
*tspec
)
196 struct pvclock_vcpu_time_info
*pvti
= this_cpu_pvti();
197 unsigned int pvti_ver
;
200 preempt_disable_notrace();
203 pvti_ver
= pvclock_read_begin(pvti
);
205 ret
= vmclock_get_crosststamp(st
, sts
, system_counter
, tspec
);
209 system_counter
->cycles
= __pvclock_read_cycles(pvti
,
210 system_counter
->cycles
);
211 system_counter
->cs_id
= CSID_X86_KVM_CLK
;
214 * This retry should never really happen; if the TSC is
215 * stable and reliable enough across vCPUS that it is sane
216 * for the hypervisor to expose a VMCLOCK device which uses
217 * it as the reference counter, then the KVM clock should be
218 * in 'master clock mode' and basically never changed. But
219 * the KVM clock is a fickle and often broken thing, so do
220 * it "properly" just in case.
222 } while (pvclock_read_retry(pvti
, pvti_ver
));
224 preempt_enable_notrace();
/*
 * Callback handed to get_device_system_crosststamp() by
 * ptp_vmclock_getcrosststamp().
 *
 * Takes the vmclock_state from ctx, obtains a device time and counter value
 * with sts passed as NULL, and stores the device time in *device_time as a
 * ktime_t. With SUPPORT_KVMCLOCK, vmclock_get_crosststamp_kvmclock() is
 * called when st->sys_cs_id reads as CSID_X86_KVM_CLK; a call to
 * vmclock_get_crosststamp() is also present.
 *
 * NOTE(review): the ctx parameter declaration, the ret declaration, the
 * branch structure between the two calls and the return statement are not
 * visible in this view.
 */
230 static int ptp_vmclock_get_time_fn(ktime_t
*device_time
,
231 struct system_counterval_t
*system_counter
,
234 struct vmclock_state
*st
= ctx
;
235 struct timespec64 tspec
;
238 #ifdef SUPPORT_KVMCLOCK
239 if (READ_ONCE(st
->sys_cs_id
) == CSID_X86_KVM_CLK
)
240 ret
= vmclock_get_crosststamp_kvmclock(st
, NULL
, system_counter
,
244 ret
= vmclock_get_crosststamp(st
, NULL
, system_counter
, &tspec
);
247 *device_time
= timespec64_to_ktime(tspec
);
/*
 * getcrosststamp handler of ptp_vmclock_info.
 *
 * Recovers the vmclock_state that embeds ptp and calls
 * get_device_system_crosststamp() with ptp_vmclock_get_time_fn and st. With
 * SUPPORT_KVMCLOCK, a result of -ENODEV triggers a system time snapshot; if
 * the system clocksource id is CSID_X86_TSC or CSID_X86_KVM_CLK it is
 * published in st->sys_cs_id with WRITE_ONCE() and
 * get_device_system_crosststamp() is called again.
 *
 * NOTE(review): the trailing arguments of both calls and the return
 * statement are not visible in this view.
 */
252 static int ptp_vmclock_getcrosststamp(struct ptp_clock_info
*ptp
,
253 struct system_device_crosststamp
*xtstamp
)
255 struct vmclock_state
*st
= container_of(ptp
, struct vmclock_state
,
257 int ret
= get_device_system_crosststamp(ptp_vmclock_get_time_fn
, st
,
259 #ifdef SUPPORT_KVMCLOCK
261 * On x86, the KVM clock may be used for the system time. We can
262 * actually convert a TSC reading to that, and return a paired
263 * timestamp that get_device_system_crosststamp() *can* handle.
265 if (ret
== -ENODEV
) {
266 struct system_time_snapshot systime_snapshot
;
268 ktime_get_snapshot(&systime_snapshot
);
270 if (systime_snapshot
.cs_id
== CSID_X86_TSC
||
271 systime_snapshot
.cs_id
== CSID_X86_KVM_CLK
) {
272 WRITE_ONCE(st
->sys_cs_id
, systime_snapshot
.cs_id
);
273 ret
= get_device_system_crosststamp(ptp_vmclock_get_time_fn
,
282 * PTP clock operations
/*
 * adjfine handler of ptp_vmclock_info; receives the clock and a long delta.
 * NOTE(review): only the signature is visible in this view; the body and
 * its return value cannot be documented from here.
 */
285 static int ptp_vmclock_adjfine(struct ptp_clock_info
*ptp
, long delta
)
/*
 * adjtime handler of ptp_vmclock_info; receives the clock and an s64 delta.
 * NOTE(review): only the signature is visible in this view; the body and
 * its return value cannot be documented from here.
 */
290 static int ptp_vmclock_adjtime(struct ptp_clock_info
*ptp
, s64 delta
)
/*
 * settime64 handler of ptp_vmclock_info; receives the clock and the
 * requested time as a const timespec64.
 * NOTE(review): only the signature is visible in this view; the body and
 * its return value cannot be documented from here.
 */
295 static int ptp_vmclock_settime(struct ptp_clock_info
*ptp
,
296 const struct timespec64
*ts
)
/*
 * gettimex64 handler of ptp_vmclock_info.
 *
 * Recovers the vmclock_state that embeds ptp and returns the result of
 * vmclock_get_crosststamp() with ts as the output time, sts as the system
 * timestamps, and NULL for the system counter output.
 */
301 static int ptp_vmclock_gettimex(struct ptp_clock_info
*ptp
, struct timespec64
*ts
,
302 struct ptp_system_timestamp
*sts
)
304 struct vmclock_state
*st
= container_of(ptp
, struct vmclock_state
,
307 return vmclock_get_crosststamp(st
, sts
, NULL
, ts
);
/*
 * enable handler of ptp_vmclock_info; receives the clock, a
 * ptp_clock_request and an on/off flag.
 * NOTE(review): only the signature is visible in this view; the body and
 * its return value cannot be documented from here.
 */
310 static int ptp_vmclock_enable(struct ptp_clock_info
*ptp
,
311 struct ptp_clock_request
*rq
, int on
)
/*
 * Template ptp_clock_info for this driver; vmclock_ptp_register() copies it
 * into each vmclock_state before registering the clock. Its owner is
 * THIS_MODULE and it wires up the adjfine, adjtime, gettimex64, settime64,
 * enable and getcrosststamp handlers defined in this file.
 * NOTE(review): the initializer may contain further members that are not
 * visible in this view, and the closing brace is not visible either.
 */
316 static const struct ptp_clock_info ptp_vmclock_info
= {
317 .owner
= THIS_MODULE
,
322 .adjfine
= ptp_vmclock_adjfine
,
323 .adjtime
= ptp_vmclock_adjtime
,
324 .gettimex64
= ptp_vmclock_gettimex
,
325 .settime64
= ptp_vmclock_settime
,
326 .enable
= ptp_vmclock_enable
,
327 .getcrosststamp
= ptp_vmclock_getcrosststamp
,
/*
 * Register a PTP clock for the device.
 *
 * dev: device used for logging and passed to ptp_clock_register().
 * st:  driver state; st->clk->counter_id selects the clocksource id.
 *
 * The clocksource id is CSID_ARM_ARCH_COUNTER for VMCLOCK_COUNTER_ARM_VCNT
 * when CONFIG_ARM64 is enabled, and CSID_X86_TSC for VMCLOCK_COUNTER_X86_TSC
 * when CONFIG_X86 is enabled. If tai_adjust() reports failure, a message is
 * logged. On the visible success path st->sys_cs_id is set to the chosen
 * id, ptp_vmclock_info is copied into st->ptp_clock_info, its name is copied
 * from st->name, and the result of ptp_clock_register() is returned.
 *
 * NOTE(review): the fallback for other counter ids, the early return
 * values and any assignment to st->cs_id are not visible in this view. The
 * caller comments that this can return NULL or an error pointer.
 */
330 static struct ptp_clock
*vmclock_ptp_register(struct device
*dev
,
331 struct vmclock_state
*st
)
333 enum clocksource_ids cs_id
;
335 if (IS_ENABLED(CONFIG_ARM64
) &&
336 st
->clk
->counter_id
== VMCLOCK_COUNTER_ARM_VCNT
) {
337 /* Can we check it's the virtual counter? */
338 cs_id
= CSID_ARM_ARCH_COUNTER
;
339 } else if (IS_ENABLED(CONFIG_X86
) &&
340 st
->clk
->counter_id
== VMCLOCK_COUNTER_X86_TSC
) {
341 cs_id
= CSID_X86_TSC
;
346 /* Only UTC, or TAI with offset */
347 if (!tai_adjust(st
->clk
, NULL
)) {
348 dev_info(dev
, "vmclock does not provide unambiguous UTC\n");
352 st
->sys_cs_id
= cs_id
;
354 st
->ptp_clock_info
= ptp_vmclock_info
;
355 strscpy(st
->ptp_clock_info
.name
, st
->name
);
357 return ptp_clock_register(&st
->ptp_clock_info
, dev
);
/*
 * mmap handler for the misc device.
 *
 * Recovers the vmclock_state from fp->private_data (via the embedded
 * miscdev), then tests that:
 *  - of VM_READ and VM_WRITE, exactly VM_READ is set in vma->vm_flags;
 *  - the mapping is exactly PAGE_SIZE long and vm_pgoff is zero.
 * It then calls io_remap_pfn_range() for PAGE_SIZE bytes at vma->vm_start,
 * using the page frame of the physical address st->res.start.
 *
 * NOTE(review): the error codes returned when a test fails, the page
 * protection argument and the final return are not visible in this view.
 */
360 static int vmclock_miscdev_mmap(struct file
*fp
, struct vm_area_struct
*vma
)
362 struct vmclock_state
*st
= container_of(fp
->private_data
,
363 struct vmclock_state
, miscdev
);
365 if ((vma
->vm_flags
& (VM_READ
|VM_WRITE
)) != VM_READ
)
368 if (vma
->vm_end
- vma
->vm_start
!= PAGE_SIZE
|| vma
->vm_pgoff
)
371 if (io_remap_pfn_range(vma
, vma
->vm_start
,
372 st
->res
.start
>> PAGE_SHIFT
, PAGE_SIZE
,
/*
 * read handler for the misc device: copies bytes of the shared structure to
 * user space.
 *
 * Visible steps:
 *  - an offset *ppos at or beyond PAGE_SIZE is special-cased;
 *  - count is compared against the PAGE_SIZE - *ppos bytes that remain;
 *  - seq_count is sampled with the low bit masked off, count bytes starting
 *    at offset *ppos of st->clk are passed to copy_to_user(), and seq_count
 *    is compared again afterwards;
 *  - a deadline of now + VMCLOCK_MAX_WAIT is checked with ktime_after().
 *
 * NOTE(review): the loop construct, the read barriers, any update of *ppos
 * and all return values are not visible in this view.
 */
379 static ssize_t
vmclock_miscdev_read(struct file
*fp
, char __user
*buf
,
380 size_t count
, loff_t
*ppos
)
382 struct vmclock_state
*st
= container_of(fp
->private_data
,
383 struct vmclock_state
, miscdev
);
384 ktime_t deadline
= ktime_add(ktime_get(), VMCLOCK_MAX_WAIT
);
388 if (*ppos
>= PAGE_SIZE
)
391 max_count
= PAGE_SIZE
- *ppos
;
392 if (count
> max_count
)
396 seq
= le32_to_cpu(st
->clk
->seq_count
) & ~1U;
397 /* Pairs with hypervisor wmb */
400 if (copy_to_user(buf
, ((char *)st
->clk
) + *ppos
, count
))
403 /* Pairs with hypervisor wmb */
405 if (seq
== le32_to_cpu(st
->clk
->seq_count
))
408 if (ktime_after(ktime_get(), deadline
))
/*
 * File operations installed on st->miscdev by vmclock_probe(): mmap and read.
 * NOTE(review): no .owner member is visible in this initializer; if it is
 * really absent, consider whether .owner = THIS_MODULE is needed here as it
 * is set in ptp_vmclock_info - confirm against the full file.
 */
416 static const struct file_operations vmclock_miscdev_fops
= {
417 .mmap
= vmclock_miscdev_mmap
,
418 .read
= vmclock_miscdev_read
,
421 /* module operations */
/*
 * Platform remove callback, also called from the vmclock_probe() error path.
 *
 * Fetches the vmclock_state from the device driver data, unregisters
 * st->ptp_clock, and deregisters st->miscdev when its minor differs from
 * MISC_DYNAMIC_MINOR.
 *
 * NOTE(review): vmclock_probe() calls this function before it calls
 * dev_set_drvdata(), so dev_get_drvdata() may not return the state on that
 * path - confirm and reorder if so.
 * NOTE(review): vmclock_probe() only sets miscdev.minor (to
 * MISC_DYNAMIC_MINOR) when the structure is at least PAGE_SIZE, and st comes
 * from devm_kzalloc(), so for smaller structures the minor stays 0 and this
 * test would deregister a misc device that was never registered - confirm.
 * NOTE(review): any guard around ptp_clock_unregister() is not visible in
 * this view.
 */
423 static void vmclock_remove(struct platform_device
*pdev
)
425 struct device
*dev
= &pdev
->dev
;
426 struct vmclock_state
*st
= dev_get_drvdata(dev
);
429 ptp_clock_unregister(st
->ptp_clock
);
431 if (st
->miscdev
.minor
!= MISC_DYNAMIC_MINOR
)
432 misc_deregister(&st
->miscdev
);
/*
 * acpi_walk_resources() callback used by vmclock_probe_acpi().
 *
 * ares: resource being visited.
 * data: the vmclock_state being filled in.
 *
 * ACPI_RESOURCE_TYPE_END_TAG entries are special-cased, and a state whose
 * st->res is already of type IORESOURCE_MEM is special-cased too (only one
 * memory resource is expected). A resource that parses through
 * acpi_dev_resource_memory() or acpi_dev_resource_address_space() is then
 * tested for type IORESOURCE_MEM and for a minimum size.
 *
 * NOTE(review): the minimum size used is sizeof(st->clk), and clk is
 * declared as a pointer in struct vmclock_state, so this is the size of a
 * pointer rather than of struct vmclock_abi - confirm whether
 * sizeof(*st->clk) was intended.
 * NOTE(review): the return statements and the store into st->res are not
 * visible in this view.
 */
435 static acpi_status
vmclock_acpi_resources(struct acpi_resource
*ares
, void *data
)
437 struct vmclock_state
*st
= data
;
438 struct resource_win win
;
439 struct resource
*res
= &win
.res
;
441 if (ares
->type
== ACPI_RESOURCE_TYPE_END_TAG
)
444 /* There can be only one */
445 if (resource_type(&st
->res
) == IORESOURCE_MEM
)
448 if (acpi_dev_resource_memory(ares
, res
) ||
449 acpi_dev_resource_address_space(ares
, &win
)) {
451 if (resource_type(res
) != IORESOURCE_MEM
||
452 resource_size(res
) < sizeof(st
->clk
))
/*
 * Locate the vmclock memory region through ACPI.
 *
 * Walks the METHOD_NAME__CRS resources of the ACPI companion of dev with
 * vmclock_acpi_resources(), passing st as its context. If the walk fails or
 * st->res does not end up with type IORESOURCE_MEM, an error is logged.
 *
 * NOTE(review): the status declaration, the check that the Coverity remark
 * below refers to, and the return values are not visible in this view.
 */
462 static int vmclock_probe_acpi(struct device
*dev
, struct vmclock_state
*st
)
464 struct acpi_device
*adev
= ACPI_COMPANION(dev
);
468 * This should never happen as this function is only called when
469 * has_acpi_companion(dev) is true, but the logic is sufficiently
470 * complex that Coverity can't see the tautology.
475 status
= acpi_walk_resources(adev
->handle
, METHOD_NAME__CRS
,
476 vmclock_acpi_resources
, st
);
477 if (ACPI_FAILURE(status
) || resource_type(&st
->res
) != IORESOURCE_MEM
) {
478 dev_err(dev
, "failed to get resources\n");
/*
 * devm action registered by vmclock_probe(): returns st->index to
 * vmclock_ida with ida_free(). data is the vmclock_state.
 */
485 static void vmclock_put_idx(void *data
)
487 struct vmclock_state
*st
= data
;
489 ida_free(&vmclock_ida
, st
->index
);
/*
 * Platform probe callback.
 *
 * Visible steps, in order:
 *  1. Allocate a zeroed vmclock_state with devm_kzalloc().
 *  2. Find the memory region: vmclock_probe_acpi() when the device has an
 *     ACPI companion; -EINVAL is the other visible outcome (only ACPI is
 *     supported for now).
 *  3. Compare the region size with VMCLOCK_MIN_SIZE and log when too small.
 *  4. Map the region with devm_memremap() (MEMREMAP_WB | MEMREMAP_DEC).
 *  5. Validate the header: magic must be VMCLOCK_MAGIC, size must not
 *     exceed the region, version must be 1.
 *  6. Allocate an id from vmclock_ida, register vmclock_put_idx() as a devm
 *     action, and build the name from the vmclock prefix and st->index.
 *  7. If the structure size is at least PAGE_SIZE, register a misc device
 *     using vmclock_miscdev_fops and that name.
 *  8. If the time_frac_sec field is present, call vmclock_ptp_register();
 *     on an error pointer, record the error, clear st->ptp_clock and call
 *     vmclock_remove().
 *  9. If neither a misc device minor nor a PTP clock is present, log that
 *     nothing is registered; a summary of what was registered is logged
 *     as well.
 * 10. Store st as the device driver data.
 *
 * NOTE(review): vmclock_remove() reads the driver data, but step 8 runs
 * before step 10 sets it - confirm and move dev_set_drvdata() earlier if so.
 * NOTE(review): the error checks after several calls, the assignment of
 * st->index, the statements that leave the function and the final return
 * value are not visible in this view.
 */
492 static int vmclock_probe(struct platform_device
*pdev
)
494 struct device
*dev
= &pdev
->dev
;
495 struct vmclock_state
*st
;
498 st
= devm_kzalloc(dev
, sizeof(*st
), GFP_KERNEL
);
502 if (has_acpi_companion(dev
))
503 ret
= vmclock_probe_acpi(dev
, st
);
505 ret
= -EINVAL
; /* Only ACPI for now */
508 dev_info(dev
, "Failed to obtain physical address: %d\n", ret
);
512 if (resource_size(&st
->res
) < VMCLOCK_MIN_SIZE
) {
513 dev_info(dev
, "Region too small (0x%llx)\n",
514 resource_size(&st
->res
));
518 st
->clk
= devm_memremap(dev
, st
->res
.start
, resource_size(&st
->res
),
519 MEMREMAP_WB
| MEMREMAP_DEC
);
520 if (IS_ERR(st
->clk
)) {
521 ret
= PTR_ERR(st
->clk
);
522 dev_info(dev
, "failed to map shared memory\n");
527 if (le32_to_cpu(st
->clk
->magic
) != VMCLOCK_MAGIC
||
528 le32_to_cpu(st
->clk
->size
) > resource_size(&st
->res
) ||
529 le16_to_cpu(st
->clk
->version
) != 1) {
530 dev_info(dev
, "vmclock magic fields invalid\n");
535 ret
= ida_alloc(&vmclock_ida
, GFP_KERNEL
);
540 ret
= devm_add_action_or_reset(&pdev
->dev
, vmclock_put_idx
, st
);
544 st
->name
= devm_kasprintf(&pdev
->dev
, GFP_KERNEL
, "vmclock%d", st
->index
);
551 * If the structure is big enough, it can be mapped to userspace.
552 * Theoretically a guest OS even using larger pages could still
553 * use 4KiB PTEs to map smaller MMIO regions like this, but let's
554 * cross that bridge if/when we come to it.
556 if (le32_to_cpu(st
->clk
->size
) >= PAGE_SIZE
) {
557 st
->miscdev
.minor
= MISC_DYNAMIC_MINOR
;
558 st
->miscdev
.fops
= &vmclock_miscdev_fops
;
559 st
->miscdev
.name
= st
->name
;
561 ret
= misc_register(&st
->miscdev
);
566 /* If there is valid clock information, register a PTP clock */
567 if (VMCLOCK_FIELD_PRESENT(st
->clk
, time_frac_sec
)) {
568 /* Can return a silent NULL, or an error. */
569 st
->ptp_clock
= vmclock_ptp_register(dev
, st
);
570 if (IS_ERR(st
->ptp_clock
)) {
571 ret
= PTR_ERR(st
->ptp_clock
);
572 st
->ptp_clock
= NULL
;
573 vmclock_remove(pdev
);
578 if (!st
->miscdev
.minor
&& !st
->ptp_clock
) {
579 /* Neither miscdev nor PTP registered */
580 dev_info(dev
, "vmclock: Neither miscdev nor PTP available; not registering\n");
585 dev_info(dev
, "%s: registered %s%s%s\n", st
->name
,
586 st
->miscdev
.minor
? "miscdev" : "",
587 (st
->miscdev
.minor
&& st
->ptp_clock
) ? ", " : "",
588 st
->ptp_clock
? "PTP" : "");
590 dev_set_drvdata(dev
, st
);
/*
 * ACPI id table for this driver, passed to MODULE_DEVICE_TABLE() and
 * referenced as the acpi_match_table of vmclock_platform_driver.
 * NOTE(review): the table entries are not visible in this view.
 */
596 static const struct acpi_device_id vmclock_acpi_ids
[] = {
600 MODULE_DEVICE_TABLE(acpi
, vmclock_acpi_ids
);
/*
 * Platform driver definition: vmclock_probe() and vmclock_remove() are the
 * probe and remove callbacks, and vmclock_acpi_ids is the ACPI match table.
 * NOTE(review): the member that encloses acpi_match_table (and any driver
 * name) is not visible in this view.
 */
602 static struct platform_driver vmclock_platform_driver
= {
603 .probe
= vmclock_probe
,
604 .remove
= vmclock_remove
,
607 .acpi_match_table
= vmclock_acpi_ids
,
/*
 * Module boilerplate: module_platform_driver() is invoked for
 * vmclock_platform_driver, followed by the author, description and
 * licence tags.
 */
611 module_platform_driver(vmclock_platform_driver
)
613 MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
614 MODULE_DESCRIPTION("PTP clock using VMCLOCK");
615 MODULE_LICENSE("GPL");