// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
#include <linux/pgtable.h>
#include <linux/sched/hotplug.h>
#include <asm/apic.h>
#include <asm/barrier.h>
#include <asm/init.h>
#include <asm/intel_pt.h>
#include <asm/nmi.h>
#include <asm/processor.h>
#include <asm/reboot.h>

/* Physical address of the Multiprocessor Wakeup Structure mailbox */
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;

/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;

static u64 acpi_mp_pgd __ro_after_init;
static u64 acpi_mp_reset_vector_paddr __ro_after_init;
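
/*
 * asm_acpi_mp_play_dead() switches to the identity-mapped page tables at
 * acpi_mp_pgd and hands the CPU back to the firmware via the reset vector;
 * it does not return. Both CPU offline paths below end up in it.
 */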
static void acpi_mp_stop_this_cpu(void)
{
	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}

static void acpi_mp_play_dead(void)
{
	play_dead_common();
	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}

static void acpi_mp_cpu_die(unsigned int cpu)
{
	u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
	unsigned long timeout;

	/*
	 * Use the TEST mailbox command to prove that the BIOS got control
	 * over the CPU before declaring it dead.
	 *
	 * The BIOS has to clear the 'command' field of the mailbox.
	 */
	acpi_mp_wake_mailbox->apic_id = apicid;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_TEST);

	/* Don't wait longer than a second. */
	timeout = USEC_PER_SEC;
	while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
		udelay(1);

	if (!timeout)
		pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
}

/* The argument is required to match the type of x86_mapping_info::alloc_pgt_page */
static void __init *alloc_pgt_page(void *dummy)
{
	return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
}

static void __init free_pgt_page(void *pgt, void *dummy)
{
	return memblock_free(pgt, PAGE_SIZE);
}

/*
 * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
 * the same place as in the kernel page tables. asm_acpi_mp_play_dead()
 * switches to the identity mapping and the function has to be present at
 * the same spot in the virtual address space before and after switching
 * page tables.
 */
static int __init init_transition_pgtable(pgd_t *pgd)
{
	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
	unsigned long vaddr, paddr;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	vaddr = (unsigned long)asm_acpi_mp_play_dead;
	pgd += pgd_index(vaddr);
	if (!pgd_present(*pgd)) {
		p4d = (p4d_t *)alloc_pgt_page(NULL);
		if (!p4d)
			return -ENOMEM;
		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
	}
	p4d = p4d_offset(pgd, vaddr);
	if (!p4d_present(*p4d)) {
		pud = (pud_t *)alloc_pgt_page(NULL);
		if (!pud)
			return -ENOMEM;
		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
	}
	pud = pud_offset(p4d, vaddr);
	if (!pud_present(*pud)) {
		pmd = (pmd_t *)alloc_pgt_page(NULL);
		if (!pmd)
			return -ENOMEM;
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	}
	pmd = pmd_offset(pud, vaddr);
	if (!pmd_present(*pmd)) {
		pte = (pte_t *)alloc_pgt_page(NULL);
		if (!pte)
			return -ENOMEM;
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	}
	pte = pte_offset_kernel(pmd, vaddr);

	/* Point the PTE at the physical address of asm_acpi_mp_play_dead(). */
	paddr = slow_virt_to_phys((void *)vaddr);
	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));

	return 0;
}
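
/*
 * Build the page tables for the CPU offline path: identity-map everything
 * the kernel has mapped plus the page containing the reset vector, and map
 * asm_acpi_mp_play_dead() at its kernel virtual address so that switching
 * CR3 inside it is safe.
 */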
static int __init acpi_mp_setup_reset(u64 reset_vector)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page = alloc_pgt_page,
		.free_pgt_page	= free_pgt_page,
		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag	= _KERNPG_TABLE_NOENC,
	};
	pgd_t *pgd;

	pgd = alloc_pgt_page(NULL);
	if (!pgd)
		return -ENOMEM;

	for (int i = 0; i < nr_pfn_mapped; i++) {
		unsigned long mstart, mend;

		mstart = pfn_mapped[i].start << PAGE_SHIFT;
		mend = pfn_mapped[i].end << PAGE_SHIFT;
		if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
			kernel_ident_mapping_free(&info, pgd);
			return -ENOMEM;
		}
	}

	if (kernel_ident_mapping_init(&info, pgd,
				      PAGE_ALIGN_DOWN(reset_vector),
				      PAGE_ALIGN(reset_vector + 1))) {
		kernel_ident_mapping_free(&info, pgd);
		return -ENOMEM;
	}

	if (init_transition_pgtable(pgd)) {
		kernel_ident_mapping_free(&info, pgd);
		return -ENOMEM;
	}

	smp_ops.play_dead = acpi_mp_play_dead;
	smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
	smp_ops.cpu_die = acpi_mp_cpu_die;

	acpi_mp_reset_vector_paddr = reset_vector;
	acpi_mp_pgd = __pa(pgd);

	return 0;
}
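
/*
 * Wake up a secondary CPU through the ACPI MADT wakeup mailbox. Installed
 * as the wakeup_secondary_cpu_64 APIC callback by acpi_parse_mp_wake().
 */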
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
	if (!acpi_mp_wake_mailbox_paddr) {
		pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
		return -EOPNOTSUPP;
	}

	/*
	 * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
	 *
	 * Wakeup of secondary CPUs is fully serialized in the core code.
	 * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
	 */
	if (!acpi_mp_wake_mailbox) {
		acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
						sizeof(*acpi_mp_wake_mailbox),
						MEMREMAP_WB);
	}

	/*
	 * Mailbox memory is shared between the firmware and OS. Firmware will
	 * listen on the mailbox command address, and once it receives the
	 * wakeup command, the CPU associated with the given apicid will be
	 * booted.
	 *
	 * The value of 'apic_id' and 'wakeup_vector' must be visible to the
	 * firmware before the wakeup command is visible. smp_store_release()
	 * ensures ordering and visibility.
	 */
	acpi_mp_wake_mailbox->apic_id	    = apicid;
	acpi_mp_wake_mailbox->wakeup_vector = start_ip;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_WAKEUP);

	/*
	 * Wait for the CPU to wake up.
	 *
	 * The CPU being woken up is essentially in a spin loop waiting to be
	 * woken up. It should not take long for it to wake up and acknowledge
	 * by zeroing out ->command.
	 *
	 * The ACPI specification doesn't provide any guidance on how long the
	 * kernel has to wait for a wake up acknowledgment. It also doesn't
	 * provide a way to cancel a wake up request if it takes too long.
	 *
	 * In a TDX environment, the VMM has control over how long it takes to
	 * wake up a secondary. It can postpone scheduling the secondary vCPU
	 * indefinitely. Giving up on the wake up request and reporting an
	 * error opens a possible attack vector for the VMM: it can wake up a
	 * secondary CPU when the kernel doesn't expect it. Wait until a
	 * positive result of the wake up request.
	 */
	while (READ_ONCE(acpi_mp_wake_mailbox->command))
		cpu_relax();

	return 0;
}
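
/* Forbid CPU offlining and hide the mailbox from a future kexec kernel. */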
static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
{
	cpu_hotplug_disable_offlining();

	/*
	 * ACPI MADT doesn't allow offlining a CPU after it was onlined. This
	 * limits kexec: the second kernel won't be able to use more than one
	 * CPU.
	 *
	 * To prevent a kexec kernel from onlining secondary CPUs, invalidate
	 * the mailbox address in the ACPI MADT wakeup structure, which
	 * prevents a kexec kernel from using it.
	 *
	 * This is safe as the booting kernel has the mailbox address cached
	 * already and acpi_wakeup_cpu() uses the cached value to bring up the
	 * secondary CPUs.
	 *
	 * Note: This is a Linux specific convention and not covered by the
	 *	 ACPI specification.
	 */
	mp_wake->mailbox_address = 0;
}
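
/*
 * Parse the MP Wakeup entry of the MADT: cache the mailbox address and, for
 * v1 entries, set up the reset vector so that secondary CPUs can be offlined.
 */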
int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
			      const unsigned long end)
{
	struct acpi_madt_multiproc_wakeup *mp_wake;

	mp_wake = (struct acpi_madt_multiproc_wakeup *)header;

	/*
	 * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
	 * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
	 * than the actual size of the MP wakeup entry in the ACPI table because
	 * the 'reset_vector' is only available in the V1 MP wakeup structure.
	 */
	if (!mp_wake)
		return -EINVAL;
	if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
		return -EINVAL;
	if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
		return -EINVAL;

	acpi_table_print_madt_entry(&header->common);

	acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;

	if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
	    mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
		if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
			pr_warn("Failed to setup MADT reset vector\n");
			acpi_mp_disable_offlining(mp_wake);
		}
	} else {
		/*
		 * CPU offlining requires version 1 of the ACPI MADT wakeup
		 * structure.
		 */
		acpi_mp_disable_offlining(mp_wake);
	}

	apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);

	return 0;
}