// SPDX-License-Identifier: GPL-2.0

/*
 * Architecture neutral utility routines for interacting with
 * Hyper-V. This file is specifically for code that must be
 * built-in to the kernel image when CONFIG_HYPERV is set
 * (vs. being in a module) because it is called from architecture
 * specific code under arch/.
 *
 * Copyright (C) 2021, Microsoft, Inc.
 *
 * Author : Michael Kelley <mikelley@microsoft.com>
 */

15 #include <linux/types.h>
16 #include <linux/acpi.h>
17 #include <linux/export.h>
18 #include <linux/bitfield.h>
19 #include <linux/cpumask.h>
20 #include <linux/sched/task_stack.h>
21 #include <linux/panic_notifier.h>
22 #include <linux/ptrace.h>
23 #include <linux/random.h>
24 #include <linux/efi.h>
25 #include <linux/kdebug.h>
26 #include <linux/kmsg_dump.h>
27 #include <linux/sizes.h>
28 #include <linux/slab.h>
29 #include <linux/dma-map-ops.h>
30 #include <linux/set_memory.h>
31 #include <asm/hyperv-tlfs.h>
32 #include <asm/mshyperv.h>
35 * hv_root_partition, ms_hyperv and hv_nested are defined here with other
36 * Hyper-V specific globals so they are shared across all architectures and are
37 * built only when CONFIG_HYPERV is defined. But on x86,
38 * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not
39 * defined, and it uses these three variables. So mark them as __weak
40 * here, allowing for an overriding definition in the module containing
41 * ms_hyperv_init_platform().
43 bool __weak hv_root_partition
;
44 EXPORT_SYMBOL_GPL(hv_root_partition
);
46 bool __weak hv_nested
;
47 EXPORT_SYMBOL_GPL(hv_nested
);
49 struct ms_hyperv_info __weak ms_hyperv
;
50 EXPORT_SYMBOL_GPL(ms_hyperv
);
53 EXPORT_SYMBOL_GPL(hv_vp_index
);
56 EXPORT_SYMBOL_GPL(hv_max_vp_index
);
58 void * __percpu
*hyperv_pcpu_input_arg
;
59 EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg
);
61 void * __percpu
*hyperv_pcpu_output_arg
;
62 EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg
);
64 static void hv_kmsg_dump_unregister(void);
66 static struct ctl_table_header
*hv_ctl_table_hdr
;
69 * Hyper-V specific initialization and shutdown code that is
70 * common across all architectures. Called from architecture
71 * specific initialization functions.
74 void __init
hv_common_free(void)
76 unregister_sysctl_table(hv_ctl_table_hdr
);
77 hv_ctl_table_hdr
= NULL
;
79 if (ms_hyperv
.misc_features
& HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE
)
80 hv_kmsg_dump_unregister();
85 free_percpu(hyperv_pcpu_output_arg
);
86 hyperv_pcpu_output_arg
= NULL
;
88 free_percpu(hyperv_pcpu_input_arg
);
89 hyperv_pcpu_input_arg
= NULL
;
93 * Functions for allocating and freeing memory with size and
94 * alignment HV_HYP_PAGE_SIZE. These functions are needed because
95 * the guest page size may not be the same as the Hyper-V page
96 * size. We depend upon kmalloc() aligning power-of-two size
97 * allocations to the allocation size boundary, so that the
98 * allocated memory appears to Hyper-V as a page of the size
102 void *hv_alloc_hyperv_page(void)
104 BUILD_BUG_ON(PAGE_SIZE
< HV_HYP_PAGE_SIZE
);
106 if (PAGE_SIZE
== HV_HYP_PAGE_SIZE
)
107 return (void *)__get_free_page(GFP_KERNEL
);
109 return kmalloc(HV_HYP_PAGE_SIZE
, GFP_KERNEL
);
111 EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page
);
113 void *hv_alloc_hyperv_zeroed_page(void)
115 if (PAGE_SIZE
== HV_HYP_PAGE_SIZE
)
116 return (void *)__get_free_page(GFP_KERNEL
| __GFP_ZERO
);
118 return kzalloc(HV_HYP_PAGE_SIZE
, GFP_KERNEL
);
120 EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page
);
122 void hv_free_hyperv_page(void *addr
)
124 if (PAGE_SIZE
== HV_HYP_PAGE_SIZE
)
125 free_page((unsigned long)addr
);
129 EXPORT_SYMBOL_GPL(hv_free_hyperv_page
);
/*
 * Buffer that hv_kmsg_dump() fills with kmsg text and whose physical address
 * is passed to Hyper-V.  NULL while the kmsg dumper is not registered.
 */
static void *hv_panic_page;

/*
 * Boolean to control whether to report panic messages over Hyper-V.
 *
 * It can be set via /proc/sys/kernel/hyperv_record_panic_msg
 */
static int sysctl_record_panic_msg = 1;

141 * sysctl option to allow the user to control whether kmsg data should be
142 * reported to Hyper-V on panic.
144 static struct ctl_table hv_ctl_table
[] = {
146 .procname
= "hyperv_record_panic_msg",
147 .data
= &sysctl_record_panic_msg
,
148 .maxlen
= sizeof(int),
150 .proc_handler
= proc_dointvec_minmax
,
151 .extra1
= SYSCTL_ZERO
,
156 static int hv_die_panic_notify_crash(struct notifier_block
*self
,
157 unsigned long val
, void *args
);
159 static struct notifier_block hyperv_die_report_block
= {
160 .notifier_call
= hv_die_panic_notify_crash
,
163 static struct notifier_block hyperv_panic_report_block
= {
164 .notifier_call
= hv_die_panic_notify_crash
,
168 * The following callback works both as die and panic notifier; its
169 * goal is to provide panic information to the hypervisor unless the
170 * kmsg dumper is used [see hv_kmsg_dump()], which provides more
171 * information but isn't always available.
173 * Notice that both the panic/die report notifiers are registered only
174 * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set.
176 static int hv_die_panic_notify_crash(struct notifier_block
*self
,
177 unsigned long val
, void *args
)
179 struct pt_regs
*regs
;
182 /* Don't notify Hyper-V unless we have a die oops event or panic. */
183 if (self
== &hyperv_panic_report_block
) {
185 regs
= current_pt_regs();
186 } else { /* die event */
191 regs
= ((struct die_args
*)args
)->regs
;
195 * Hyper-V should be notified only once about a panic/die. If we will
196 * be calling hv_kmsg_dump() later with kmsg data, don't do the
199 if (!sysctl_record_panic_msg
|| !hv_panic_page
)
200 hyperv_report_panic(regs
, val
, is_die
);
206 * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg
207 * buffer and call into Hyper-V to transfer the data.
209 static void hv_kmsg_dump(struct kmsg_dumper
*dumper
,
210 struct kmsg_dump_detail
*detail
)
212 struct kmsg_dump_iter iter
;
213 size_t bytes_written
;
215 /* We are only interested in panics. */
216 if (detail
->reason
!= KMSG_DUMP_PANIC
|| !sysctl_record_panic_msg
)
220 * Write dump contents to the page. No need to synchronize; panic should
221 * be single-threaded.
223 kmsg_dump_rewind(&iter
);
224 kmsg_dump_get_buffer(&iter
, false, hv_panic_page
, HV_HYP_PAGE_SIZE
,
229 * P3 to contain the physical address of the panic page & P4 to
230 * contain the size of the panic data in that page. Rest of the
231 * registers are no-op when the NOTIFY_MSG flag is set.
233 hv_set_msr(HV_MSR_CRASH_P0
, 0);
234 hv_set_msr(HV_MSR_CRASH_P1
, 0);
235 hv_set_msr(HV_MSR_CRASH_P2
, 0);
236 hv_set_msr(HV_MSR_CRASH_P3
, virt_to_phys(hv_panic_page
));
237 hv_set_msr(HV_MSR_CRASH_P4
, bytes_written
);
240 * Let Hyper-V know there is crash data available along with
243 hv_set_msr(HV_MSR_CRASH_CTL
,
244 (HV_CRASH_CTL_CRASH_NOTIFY
|
245 HV_CRASH_CTL_CRASH_NOTIFY_MSG
));
248 static struct kmsg_dumper hv_kmsg_dumper
= {
249 .dump
= hv_kmsg_dump
,
252 static void hv_kmsg_dump_unregister(void)
254 kmsg_dump_unregister(&hv_kmsg_dumper
);
255 unregister_die_notifier(&hyperv_die_report_block
);
256 atomic_notifier_chain_unregister(&panic_notifier_list
,
257 &hyperv_panic_report_block
);
259 hv_free_hyperv_page(hv_panic_page
);
260 hv_panic_page
= NULL
;
263 static void hv_kmsg_dump_register(void)
267 hv_panic_page
= hv_alloc_hyperv_zeroed_page();
268 if (!hv_panic_page
) {
269 pr_err("Hyper-V: panic message page memory allocation failed\n");
273 ret
= kmsg_dump_register(&hv_kmsg_dumper
);
275 pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret
);
276 hv_free_hyperv_page(hv_panic_page
);
277 hv_panic_page
= NULL
;
281 int __init
hv_common_init(void)
284 union hv_hypervisor_version_info version
;
286 /* Get information about the Hyper-V host version */
287 if (!hv_get_hypervisor_version(&version
))
288 pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
289 version
.major_version
, version
.minor_version
,
290 version
.build_number
, version
.service_number
,
291 version
.service_pack
, version
.service_branch
);
293 if (hv_is_isolation_supported())
294 sysctl_record_panic_msg
= 0;
297 * Hyper-V expects to get crash register data or kmsg when
298 * crash enlightment is available and system crashes. Set
299 * crash_kexec_post_notifiers to be true to make sure that
300 * calling crash enlightment interface before running kdump
303 if (ms_hyperv
.misc_features
& HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE
) {
304 u64 hyperv_crash_ctl
;
306 crash_kexec_post_notifiers
= true;
307 pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n");
310 * Panic message recording (sysctl_record_panic_msg)
311 * is enabled by default in non-isolated guests and
312 * disabled by default in isolated guests; the panic
313 * message recording won't be available in isolated
314 * guests should the following registration fail.
316 hv_ctl_table_hdr
= register_sysctl("kernel", hv_ctl_table
);
317 if (!hv_ctl_table_hdr
)
318 pr_err("Hyper-V: sysctl table register error");
321 * Register for panic kmsg callback only if the right
322 * capability is supported by the hypervisor.
324 hyperv_crash_ctl
= hv_get_msr(HV_MSR_CRASH_CTL
);
325 if (hyperv_crash_ctl
& HV_CRASH_CTL_CRASH_NOTIFY_MSG
)
326 hv_kmsg_dump_register();
328 register_die_notifier(&hyperv_die_report_block
);
329 atomic_notifier_chain_register(&panic_notifier_list
,
330 &hyperv_panic_report_block
);
334 * Allocate the per-CPU state for the hypercall input arg.
335 * If this allocation fails, we will not be able to setup
336 * (per-CPU) hypercall input page and thus this failure is
339 hyperv_pcpu_input_arg
= alloc_percpu(void *);
340 BUG_ON(!hyperv_pcpu_input_arg
);
342 /* Allocate the per-CPU state for output arg for root */
343 if (hv_root_partition
) {
344 hyperv_pcpu_output_arg
= alloc_percpu(void *);
345 BUG_ON(!hyperv_pcpu_output_arg
);
348 hv_vp_index
= kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index
),
355 for (i
= 0; i
< num_possible_cpus(); i
++)
356 hv_vp_index
[i
] = VP_INVAL
;
361 void __init
ms_hyperv_late_init(void)
363 struct acpi_table_header
*header
;
369 * Seed the Linux random number generator with entropy provided by
370 * the Hyper-V host in ACPI table OEM0.
372 if (!IS_ENABLED(CONFIG_ACPI
))
375 status
= acpi_get_table("OEM0", 0, &header
);
376 if (ACPI_FAILURE(status
) || !header
)
380 * Since the "OEM0" table name is for OEM specific usage, verify
381 * that what we're seeing purports to be from Microsoft.
383 if (strncmp(header
->oem_table_id
, "MICROSFT", 8))
387 * Ensure the length is reasonable. Requiring at least 8 bytes and
388 * no more than 4K bytes is somewhat arbitrary and just protects
389 * against a malformed table. Hyper-V currently provides 64 bytes,
390 * but allow for a change in a later version.
392 if (header
->length
< sizeof(*header
) + 8 ||
393 header
->length
> sizeof(*header
) + SZ_4K
)
396 length
= header
->length
- sizeof(*header
);
397 randomdata
= (u8
*)(header
+ 1);
399 pr_debug("Hyper-V: Seeding rng with %d random bytes from ACPI table OEM0\n",
402 add_bootloader_randomness(randomdata
, length
);
405 * To prevent the seed data from being visible in /sys/firmware/acpi,
406 * zero out the random data in the ACPI table and fixup the checksum.
407 * The zero'ing is done out of an abundance of caution in avoiding
408 * potential security risks to the rng. Similarly, reset the table
409 * length to just the header size so that a subsequent kexec doesn't
410 * try to use the zero'ed out random data.
412 for (i
= 0; i
< length
; i
++) {
413 header
->checksum
+= randomdata
[i
];
417 for (i
= 0; i
< sizeof(header
->length
); i
++)
418 header
->checksum
+= ((u8
*)&header
->length
)[i
];
419 header
->length
= sizeof(*header
);
420 for (i
= 0; i
< sizeof(header
->length
); i
++)
421 header
->checksum
-= ((u8
*)&header
->length
)[i
];
424 acpi_put_table(header
);
428 * Hyper-V specific initialization and die code for
429 * individual CPUs that is common across all architectures.
430 * Called by the CPU hotplug mechanism.
433 int hv_common_cpu_init(unsigned int cpu
)
435 void **inputarg
, **outputarg
;
438 int pgcount
= hv_root_partition
? 2 : 1;
442 /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
443 flags
= irqs_disabled() ? GFP_ATOMIC
: GFP_KERNEL
;
445 inputarg
= (void **)this_cpu_ptr(hyperv_pcpu_input_arg
);
448 * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
449 * allocated if this CPU was previously online and then taken offline
452 mem
= kmalloc(pgcount
* HV_HYP_PAGE_SIZE
, flags
);
456 if (hv_root_partition
) {
457 outputarg
= (void **)this_cpu_ptr(hyperv_pcpu_output_arg
);
458 *outputarg
= (char *)mem
+ HV_HYP_PAGE_SIZE
;
461 if (!ms_hyperv
.paravisor_present
&&
462 (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
463 ret
= set_memory_decrypted((unsigned long)mem
, pgcount
);
465 /* It may be unsafe to free 'mem' */
469 memset(mem
, 0x00, pgcount
* HV_HYP_PAGE_SIZE
);
473 * In a fully enlightened TDX/SNP VM with more than 64 VPs, if
474 * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() ->
475 * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to
476 * use hyperv_pcpu_input_arg as the hypercall input page, which
477 * must be a decrypted page in such a VM, but the page is still
478 * encrypted before set_memory_decrypted() returns. Fix this by
479 * setting *inputarg after the above set_memory_decrypted(): if
480 * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns
481 * HV_STATUS_INVALID_PARAMETER immediately, and the function
482 * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(),
483 * which may be slightly slower than the hypercall, but still
484 * works correctly in such a VM.
489 msr_vp_index
= hv_get_msr(HV_MSR_VP_INDEX
);
491 hv_vp_index
[cpu
] = msr_vp_index
;
493 if (msr_vp_index
> hv_max_vp_index
)
494 hv_max_vp_index
= msr_vp_index
;
/*
 * CPU offline counterpart of hv_common_cpu_init().  Intentionally a no-op.
 *
 * @cpu: Linux CPU number going offline (unused)
 *
 * Always returns 0.
 */
int hv_common_cpu_die(unsigned int cpu)
{
	/*
	 * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory
	 * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg
	 * may be used by the Hyper-V vPCI driver in reassigning interrupts
	 * as part of the offlining process.  The interrupt reassignment
	 * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and
	 * called this function.
	 *
	 * If a previously offlined CPU is brought back online again, the
	 * originally allocated memory is reused in hv_common_cpu_init().
	 */

	return 0;
}

516 /* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
517 bool hv_query_ext_cap(u64 cap_query
)
520 * The address of the 'hv_extended_cap' variable will be used as an
521 * output parameter to the hypercall below and so it should be
522 * compatible with 'virt_to_phys'. Which means, it's address should be
523 * directly mapped. Use 'static' to keep it compatible; stack variables
524 * can be virtually mapped, making them incompatible with
526 * Hypercall input/output addresses should also be 8-byte aligned.
528 static u64 hv_extended_cap
__aligned(8);
529 static bool hv_extended_cap_queried
;
533 * Querying extended capabilities is an extended hypercall. Check if the
534 * partition supports extended hypercall, first.
536 if (!(ms_hyperv
.priv_high
& HV_ENABLE_EXTENDED_HYPERCALLS
))
539 /* Extended capabilities do not change at runtime. */
540 if (hv_extended_cap_queried
)
541 return hv_extended_cap
& cap_query
;
543 status
= hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES
, NULL
,
547 * The query extended capabilities hypercall should not fail under
548 * any normal circumstances. Avoid repeatedly making the hypercall, on
551 hv_extended_cap_queried
= true;
552 if (!hv_result_success(status
)) {
553 pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n",
558 return hv_extended_cap
& cap_query
;
560 EXPORT_SYMBOL_GPL(hv_query_ext_cap
);
562 void hv_setup_dma_ops(struct device
*dev
, bool coherent
)
564 arch_setup_dma_ops(dev
, coherent
);
566 EXPORT_SYMBOL_GPL(hv_setup_dma_ops
);
568 bool hv_is_hibernation_supported(void)
570 return !hv_root_partition
&& acpi_sleep_state_supported(ACPI_STATE_S4
);
572 EXPORT_SYMBOL_GPL(hv_is_hibernation_supported
);
575 * Default function to read the Hyper-V reference counter, independent
576 * of whether Hyper-V enlightened clocks/timers are being used. But on
577 * architectures where it is used, Hyper-V enlightenment code in
578 * hyperv_timer.c may override this function.
580 static u64
__hv_read_ref_counter(void)
582 return hv_get_msr(HV_MSR_TIME_REF_COUNT
);
585 u64 (*hv_read_reference_counter
)(void) = __hv_read_ref_counter
;
586 EXPORT_SYMBOL_GPL(hv_read_reference_counter
);
588 /* These __weak functions provide default "no-op" behavior and
589 * may be overridden by architecture specific versions. Architectures
590 * for which the default "no-op" behavior is sufficient can leave
591 * them unimplemented and not be cluttered with a bunch of stub
592 * functions in arch-specific code.
595 bool __weak
hv_is_isolation_supported(void)
599 EXPORT_SYMBOL_GPL(hv_is_isolation_supported
);
601 bool __weak
hv_isolation_type_snp(void)
605 EXPORT_SYMBOL_GPL(hv_isolation_type_snp
);
607 bool __weak
hv_isolation_type_tdx(void)
611 EXPORT_SYMBOL_GPL(hv_isolation_type_tdx
);
613 void __weak
hv_setup_vmbus_handler(void (*handler
)(void))
616 EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler
);
618 void __weak
hv_remove_vmbus_handler(void)
621 EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler
);
623 void __weak
hv_setup_kexec_handler(void (*handler
)(void))
626 EXPORT_SYMBOL_GPL(hv_setup_kexec_handler
);
628 void __weak
hv_remove_kexec_handler(void)
631 EXPORT_SYMBOL_GPL(hv_remove_kexec_handler
);
633 void __weak
hv_setup_crash_handler(void (*handler
)(struct pt_regs
*regs
))
636 EXPORT_SYMBOL_GPL(hv_setup_crash_handler
);
638 void __weak
hv_remove_crash_handler(void)
641 EXPORT_SYMBOL_GPL(hv_remove_crash_handler
);
643 void __weak
hyperv_cleanup(void)
646 EXPORT_SYMBOL_GPL(hyperv_cleanup
);
648 u64 __weak
hv_ghcb_hypercall(u64 control
, void *input
, void *output
, u32 input_size
)
650 return HV_STATUS_INVALID_PARAMETER
;
652 EXPORT_SYMBOL_GPL(hv_ghcb_hypercall
);
654 u64 __weak
hv_tdx_hypercall(u64 control
, u64 param1
, u64 param2
)
656 return HV_STATUS_INVALID_PARAMETER
;
658 EXPORT_SYMBOL_GPL(hv_tdx_hypercall
);