// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 * cpuid support routines
 *
 * derived from arch/x86/kvm/x86.c
 *
 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
 * Copyright IBM Corporation, 2008
 */

#include <linux/kvm_host.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <linux/sched/stat.h>

#include <asm/processor.h>
#include <asm/user.h>
#include <asm/fpu/xstate.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
#include "trace.h"
#include "pmu.h"
/*
 * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
 * aligned to sizeof(unsigned long) because it's not accessed via bitops.
 */
u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);
static u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
	int feature_bit = 0;
	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;

	xstate_bv &= XFEATURE_MASK_EXTEND;
	while (xstate_bv) {
		if (xstate_bv & 0x1) {
			u32 eax, ebx, ecx, edx, offset;

			cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
			offset = compacted ? ret : ebx;
			ret = max(ret, offset + eax);
		}

		xstate_bv >>= 1;
		feature_bit++;
	}

	return ret;
}
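/*
 * Example (hypothetical xstate_bv, values differ per CPU): with only feature
 * bit 2 (AVX state) enabled, the standard-format size is
 * max(576, CPUID.0xD[2].EBX + CPUID.0xD[2].EAX), i.e. the component's
 * architectural offset plus its size, while the compacted format places the
 * component at the running size ("ret") immediately after the 512-byte
 * legacy region and 64-byte XSAVE header.
 */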
#define F feature_bit

int kvm_update_cpuid(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;
	struct kvm_lapic *apic = vcpu->arch.apic;

	best = kvm_find_cpuid_entry(vcpu, 1, 0);
	if (!best)
		return 0;

	/* Update OSXSAVE bit */
	if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1)
		cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
				   kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));

	cpuid_entry_change(best, X86_FEATURE_APIC,
			   vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);

	if (apic) {
		if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
			apic->lapic_timer.timer_mode_mask = 3 << 17;
		else
			apic->lapic_timer.timer_mode_mask = 1 << 17;
	}

	best = kvm_find_cpuid_entry(vcpu, 7, 0);
	if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
		cpuid_entry_change(best, X86_FEATURE_OSPKE,
				   kvm_read_cr4_bits(vcpu, X86_CR4_PKE));

	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
	if (!best) {
		vcpu->arch.guest_supported_xcr0 = 0;
		vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
	} else {
		vcpu->arch.guest_supported_xcr0 =
			(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
		vcpu->arch.guest_xstate_size = best->ebx =
			xstate_required_size(vcpu->arch.xcr0, false);
	}

	best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
	if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
		     cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
		best->ebx = xstate_required_size(vcpu->arch.xcr0, true);

	/*
	 * The existing code assumes virtual address is 48-bit or 57-bit in the
	 * canonical address checks; exit if it is ever changed.
	 */
	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
	if (best) {
		int vaddr_bits = (best->eax & 0xff00) >> 8;

		if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
			return -EINVAL;
	}

	best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
	if (kvm_hlt_in_guest(vcpu->kvm) && best &&
	    (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
		best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);

	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
		best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
		if (best)
			cpuid_entry_change(best, X86_FEATURE_MWAIT,
					   vcpu->arch.ia32_misc_enable_msr &
					   MSR_IA32_MISC_ENABLE_MWAIT);
	}

	/* Update physical-address width */
	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
	kvm_mmu_reset_context(vcpu);

	kvm_pmu_refresh(vcpu);
	return 0;
}
static int is_efer_nx(void)
{
	return host_efer & EFER_NX;
}
static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *e, *entry;
	int i;

	entry = NULL;
	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
		e = &vcpu->arch.cpuid_entries[i];
		if (e->function == 0x80000001) {
			entry = e;
			break;
		}
	}
	if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) {
		cpuid_entry_clear(entry, X86_FEATURE_NX);
		printk(KERN_INFO "kvm: guest NX capability removed\n");
	}
}
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
	if (!best || best->eax < 0x80000008)
		goto not_found;
	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
	if (best)
		return best->eax & 0xff;
not_found:
	return 36;
}
EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
			     struct kvm_cpuid *cpuid,
			     struct kvm_cpuid_entry __user *entries)
{
	int r, i;
	struct kvm_cpuid_entry *cpuid_entries = NULL;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	if (cpuid->nent) {
		r = -ENOMEM;
		cpuid_entries =
			vmalloc(array_size(sizeof(struct kvm_cpuid_entry),
					   cpuid->nent));
		if (!cpuid_entries)
			goto out;
		r = -EFAULT;
		if (copy_from_user(cpuid_entries, entries,
				   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
			goto out;
	}
	for (i = 0; i < cpuid->nent; i++) {
		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
		vcpu->arch.cpuid_entries[i].index = 0;
		vcpu->arch.cpuid_entries[i].flags = 0;
		vcpu->arch.cpuid_entries[i].padding[0] = 0;
		vcpu->arch.cpuid_entries[i].padding[1] = 0;
		vcpu->arch.cpuid_entries[i].padding[2] = 0;
	}
	vcpu->arch.cpuid_nent = cpuid->nent;
	cpuid_fix_nx_cap(vcpu);
	kvm_apic_set_version(vcpu);
	kvm_x86_ops.cpuid_update(vcpu);
	r = kvm_update_cpuid(vcpu);

out:
	vfree(cpuid_entries);
	return r;
}
int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
			      struct kvm_cpuid2 *cpuid,
			      struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -EFAULT;
	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	vcpu->arch.cpuid_nent = cpuid->nent;
	kvm_apic_set_version(vcpu);
	kvm_x86_ops.cpuid_update(vcpu);
	r = kvm_update_cpuid(vcpu);
out:
	return r;
}
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
			      struct kvm_cpuid2 *cpuid,
			      struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent < vcpu->arch.cpuid_nent)
		goto out;
	r = -EFAULT;
	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	return 0;

out:
	cpuid->nent = vcpu->arch.cpuid_nent;
	return r;
}
static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
{
	const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
	struct kvm_cpuid_entry2 entry;

	reverse_cpuid_check(leaf);
	kvm_cpu_caps[leaf] &= mask;

	cpuid_count(cpuid.function, cpuid.index,
		    &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);

	kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
}
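/*
 * After kvm_cpu_cap_mask(), a bit remains set in kvm_cpu_caps[leaf] only if
 * it survived the initial copy from boot_cpu_data (see kvm_set_cpu_caps()
 * below), is present in the caller-provided mask, and is reported by the raw
 * CPUID executed here.  E.g. F(PCID) in the CPUID_1_ECX mask below is
 * advertised only when the host CPU itself enumerates PCID.
 */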
void kvm_set_cpu_caps(void)
{
	unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
	unsigned int f_gbpages = F(GBPAGES);
	unsigned int f_lm = F(LM);
#else
	unsigned int f_gbpages = 0;
	unsigned int f_lm = 0;
#endif

	BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
		     sizeof(boot_cpu_data.x86_capability));

	memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
	       sizeof(kvm_cpu_caps));

	kvm_cpu_cap_mask(CPUID_1_ECX,
		/*
		 * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
		 * advertised to guests via CPUID!
		 */
		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
		0 /* DS-CPL, VMX, SMX, EST */ |
		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
		F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
		0 /* Reserved */ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
		F(F16C) | F(RDRAND)
	);
	/* KVM emulates x2apic in software irrespective of host support. */
	kvm_cpu_cap_set(X86_FEATURE_X2APIC);

	kvm_cpu_cap_mask(CPUID_1_EDX,
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
		0 /* Reserved, DS, ACPI */ | F(MMX) |
		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
		0 /* HTT, TM, Reserved, PBE */
	);

	kvm_cpu_cap_mask(CPUID_7_0_EBX,
		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
		F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
		F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
		F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
		F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
	);

	kvm_cpu_cap_mask(CPUID_7_ECX,
		F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
	);
	/* Set LA57 based on hardware capability. */
	if (cpuid_ecx(7) & F(LA57))
		kvm_cpu_cap_set(X86_FEATURE_LA57);

	kvm_cpu_cap_mask(CPUID_7_EDX,
		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
		F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM)
	);

	/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
	kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST);
	kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES);

	if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
	if (boot_cpu_has(X86_FEATURE_STIBP))
		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);

	kvm_cpu_cap_mask(CPUID_7_1_EAX,
		F(AVX512_BF16)
	);

	kvm_cpu_cap_mask(CPUID_D_1_EAX,
		F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
	);

	kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
		F(TOPOEXT) | F(PERFCTR_CORE)
	);

	kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* Reserved */ |
		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
		F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
	);

	if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
		kvm_cpu_cap_set(X86_FEATURE_GBPAGES);

	kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
		F(CLZERO) | F(XSAVEERPTR) |
		F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
		F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON)
	);

	/*
	 * AMD has separate bits for each SPEC_CTRL bit.
	 * arch/x86/kernel/cpu/bugs.c is kind enough to
	 * record that in cpufeatures so use them.
	 */
	if (boot_cpu_has(X86_FEATURE_IBPB))
		kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
	if (boot_cpu_has(X86_FEATURE_IBRS))
		kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
	if (boot_cpu_has(X86_FEATURE_STIBP))
		kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
		kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);

	/*
	 * The preference is to use SPEC CTRL MSR instead of the
	 * VIRT_SPEC MSR.
	 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
	    !boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);

	/*
	 * Hide all SVM features by default, SVM will set the cap bits for
	 * features it emulates and/or exposes for L1.
	 */
	kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);

	kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
		F(PMM) | F(PMM_EN)
	);
}
EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
struct kvm_cpuid_array {
	struct kvm_cpuid_entry2 *entries;
	int maxnent;
	int nent;
};
static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
					      u32 function, u32 index)
{
	struct kvm_cpuid_entry2 *entry;

	if (array->nent >= array->maxnent)
		return NULL;

	entry = &array->entries[array->nent++];

	entry->function = function;
	entry->index = index;
	entry->flags = 0;

	cpuid_count(entry->function, entry->index,
		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);

	switch (function) {
	case 4:
	case 7:
	case 0xb:
	case 0xd:
	case 0xf:
	case 0x10:
	case 0x12:
	case 0x14:
	case 0x17:
	case 0x18:
	case 0x1f:
	case 0x8000001d:
		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		break;
	}

	return entry;
}
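/*
 * Usage sketch (hypothetical call): do_host_cpuid(array, 4, 2) appends one
 * entry holding the host's raw CPUID.0x4.0x2 output and tags it
 * KVM_CPUID_FLAG_SIGNIFCANT_INDEX, so kvm_find_cpuid_entry() matches it only
 * when that exact sub-leaf is requested.
 */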
static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
{
	struct kvm_cpuid_entry2 *entry;

	if (array->nent >= array->maxnent)
		return -E2BIG;

	entry = &array->entries[array->nent];
	entry->function = func;
	entry->index = 0;
	entry->flags = 0;

	switch (func) {
	case 0:
		entry->eax = 7;
		++array->nent;
		break;
	case 1:
		entry->ecx = F(MOVBE);
		++array->nent;
		break;
	case 7:
		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		entry->ecx = F(RDPID);
		++array->nent;
		break;
	default:
		break;
	}

	return 0;
}
static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
{
	struct kvm_cpuid_entry2 *entry;
	int r, i, max_idx;

	/* all calls to cpuid_count() should be made on the same cpu */
	get_cpu();

	r = -E2BIG;

	entry = do_host_cpuid(array, function, 0);
	if (!entry)
		goto out;

	switch (function) {
	case 0:
		/* Limited to the highest leaf implemented in KVM. */
		entry->eax = min(entry->eax, 0x1fU);
		break;
	case 1:
		cpuid_entry_override(entry, CPUID_1_EDX);
		cpuid_entry_override(entry, CPUID_1_ECX);
		break;
	case 2:
		/*
		 * On ancient CPUs, function 2 entries are STATEFUL.  That is,
		 * CPUID(function=2, index=0) may return different results each
		 * time, with the least-significant byte in EAX enumerating the
		 * number of times software should do CPUID(2, 0).
		 *
		 * Modern CPUs, i.e. every CPU KVM has *ever* run on are less
		 * idiotic.  Intel's SDM states that EAX & 0xff "will always
		 * return 01H. Software should ignore this value and not
		 * interpret it as an informational descriptor", while AMD's
		 * APM states that CPUID(2) is reserved.
		 *
		 * WARN if a frankenstein CPU that supports virtualization and
		 * a stateful CPUID.0x2 is encountered.
		 */
		WARN_ON_ONCE((entry->eax & 0xff) > 1);
		break;
	/* functions 4 and 0x8000001d have additional index. */
	case 4:
	case 0x8000001d:
		/*
		 * Read entries until the cache type in the previous entry is
		 * zero, i.e. indicates an invalid entry.
		 */
		for (i = 1; entry->eax & 0x1f; ++i) {
			entry = do_host_cpuid(array, function, i);
			if (!entry)
				goto out;
		}
		break;
	case 6: /* Thermal management */
		entry->eax = 0x4; /* allow ARAT */
		entry->ebx = 0;
		entry->ecx = 0;
		entry->edx = 0;
		break;
	/* function 7 has additional index. */
	case 7:
		entry->eax = min(entry->eax, 1u);
		cpuid_entry_override(entry, CPUID_7_0_EBX);
		cpuid_entry_override(entry, CPUID_7_ECX);
		cpuid_entry_override(entry, CPUID_7_EDX);

		/* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
		if (entry->eax == 1) {
			entry = do_host_cpuid(array, function, 1);
			if (!entry)
				goto out;

			cpuid_entry_override(entry, CPUID_7_1_EAX);
			entry->ebx = 0;
			entry->ecx = 0;
			entry->edx = 0;
		}
		break;
	case 0xa: { /* Architectural Performance Monitoring */
		struct x86_pmu_capability cap;
		union cpuid10_eax eax;
		union cpuid10_edx edx;

		perf_get_x86_pmu_capability(&cap);

		/*
		 * Only support guest architectural pmu on a host
		 * with architectural pmu.
		 */
		if (!cap.version)
			memset(&cap, 0, sizeof(cap));

		eax.split.version_id = min(cap.version, 2);
		eax.split.num_counters = cap.num_counters_gp;
		eax.split.bit_width = cap.bit_width_gp;
		eax.split.mask_length = cap.events_mask_len;

		edx.split.num_counters_fixed = cap.num_counters_fixed;
		edx.split.bit_width_fixed = cap.bit_width_fixed;
		edx.split.reserved = 0;

		entry->eax = eax.full;
		entry->ebx = cap.events_mask;
		entry->ecx = 0;
		entry->edx = edx.full;
		break;
	}
	/*
	 * Per Intel's SDM, the 0x1f is a superset of 0xb,
	 * thus they can be handled by common code.
	 */
	case 0x1f:
	case 0xb:
		/*
		 * Populate entries until the level type (ECX[15:8]) of the
		 * previous entry is zero.  Note, CPUID EAX.{0x1f,0xb}.0 is
		 * the starting entry, filled by the primary do_host_cpuid().
		 */
		for (i = 1; entry->ecx & 0xff00; ++i) {
			entry = do_host_cpuid(array, function, i);
			if (!entry)
				goto out;
		}
		break;
	case 0xd:
		entry->eax &= supported_xcr0;
		entry->ebx = xstate_required_size(supported_xcr0, false);
		entry->ecx = entry->ebx;
		entry->edx &= supported_xcr0 >> 32;

		entry = do_host_cpuid(array, function, 1);
		if (!entry)
			goto out;

		cpuid_entry_override(entry, CPUID_D_1_EAX);
		if (entry->eax & (F(XSAVES)|F(XSAVEC)))
			entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
							  true);
		else {
			WARN_ON_ONCE(supported_xss != 0);
			entry->ebx = 0;
		}
		entry->ecx &= supported_xss;
		entry->edx &= supported_xss >> 32;

		for (i = 2; i < 64; ++i) {
			bool s_state;

			if (supported_xcr0 & BIT_ULL(i))
				s_state = false;
			else if (supported_xss & BIT_ULL(i))
				s_state = true;
			else
				continue;

			entry = do_host_cpuid(array, function, i);
			if (!entry)
				goto out;

			/*
			 * The supported check above should have filtered out
			 * invalid sub-leafs.  Only valid sub-leafs should
			 * reach this point, and they should have a non-zero
			 * save state size.  Furthermore, check whether the
			 * processor agrees with supported_xcr0/supported_xss
			 * on whether this is an XCR0- or IA32_XSS-managed area.
			 */
			if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
				--array->nent;
				continue;
			}
			entry->edx = 0;
		}
		break;
	case 0x14: /* Intel PT */
		if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
			break;
		}

		for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
			if (!do_host_cpuid(array, function, i))
				goto out;
		}
		break;
	case KVM_CPUID_SIGNATURE: {
		static const char signature[12] = "KVMKVMKVM\0\0";
		const u32 *sigptr = (const u32 *)signature;
		entry->eax = KVM_CPUID_FEATURES;
		entry->ebx = sigptr[0];
		entry->ecx = sigptr[1];
		entry->edx = sigptr[2];
		break;
	}
	case KVM_CPUID_FEATURES:
		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
			     (1 << KVM_FEATURE_ASYNC_PF) |
			     (1 << KVM_FEATURE_PV_EOI) |
			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
			     (1 << KVM_FEATURE_PV_UNHALT) |
			     (1 << KVM_FEATURE_PV_TLB_FLUSH) |
			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
			     (1 << KVM_FEATURE_PV_SEND_IPI) |
			     (1 << KVM_FEATURE_POLL_CONTROL) |
			     (1 << KVM_FEATURE_PV_SCHED_YIELD);

		if (sched_info_on())
			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);

		entry->ebx = 0;
		entry->ecx = 0;
		entry->edx = 0;
		break;
	case 0x80000000:
		entry->eax = min(entry->eax, 0x8000001f);
		break;
	case 0x80000001:
		cpuid_entry_override(entry, CPUID_8000_0001_EDX);
		cpuid_entry_override(entry, CPUID_8000_0001_ECX);
		break;
	case 0x80000007: /* Advanced power management */
		/* invariant TSC is CPUID.80000007H:EDX[8] */
		entry->edx &= (1 << 8);
		/* mask against host */
		entry->edx &= boot_cpu_data.x86_power;
		entry->eax = entry->ebx = entry->ecx = 0;
		break;
	case 0x80000008: {
		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
		unsigned phys_as = entry->eax & 0xff;

		if (!g_phys_as)
			g_phys_as = phys_as;
		entry->eax = g_phys_as | (virt_as << 8);
		entry->edx = 0;
		cpuid_entry_override(entry, CPUID_8000_0008_EBX);
		break;
	}
	case 0x8000000A:
		if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) {
			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
			break;
		}
		entry->eax = 1; /* SVM revision 1 */
		entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
				   ASID emulation to nested SVM */
		entry->ecx = 0; /* Reserved */
		cpuid_entry_override(entry, CPUID_8000_000A_EDX);
		break;
	case 0x80000019:
		entry->ecx = entry->edx = 0;
		break;
	case 0x8000001a:
	case 0x8000001e:
		break;
	/* Support memory encryption cpuid if host supports it */
	case 0x8000001F:
		if (!boot_cpu_has(X86_FEATURE_SEV))
			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
		break;
	/* Add support for Centaur's CPUID instruction. */
	case 0xC0000000:
		/* Just support up to 0xC0000004 now. */
		entry->eax = min(entry->eax, 0xC0000004);
		break;
	case 0xC0000001:
		cpuid_entry_override(entry, CPUID_C000_0001_EDX);
		break;
	case 3: /* Processor serial number */
	case 5: /* MONITOR/MWAIT */
	case 0xC0000002:
	case 0xC0000003:
	case 0xC0000004:
	default:
		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
		break;
	}

	r = 0;

out:
	put_cpu();

	return r;
}
static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func,
			 unsigned int type)
{
	if (type == KVM_GET_EMULATED_CPUID)
		return __do_cpuid_func_emulated(array, func);

	return __do_cpuid_func(array, func);
}
#define CENTAUR_CPUID_SIGNATURE 0xC0000000
static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func,
			  unsigned int type)
{
	u32 limit;
	int r;

	if (func == CENTAUR_CPUID_SIGNATURE &&
	    boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR)
		return 0;

	r = do_cpuid_func(array, func, type);
	if (r)
		return r;

	limit = array->entries[array->nent - 1].eax;
	for (func = func + 1; func <= limit; ++func) {
		r = do_cpuid_func(array, func, type);
		if (r)
			break;
	}

	return r;
}
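/*
 * Example walk (limits depend on the host): starting at func 0x80000000,
 * the first do_cpuid_func() call records the max extended leaf (capped to
 * 0x8000001f by __do_cpuid_func()) in array->entries[nent - 1].eax, and the
 * loop above then enumerates 0x80000001 .. limit one leaf at a time.
 */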
static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
				 __u32 num_entries, unsigned int ioctl_type)
{
	int i;
	__u32 pad[3];

	if (ioctl_type != KVM_GET_EMULATED_CPUID)
		return false;

	/*
	 * We want to make sure that ->padding is being passed clean from
	 * userspace in case we want to use it for something in the future.
	 *
	 * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
	 * have to give ourselves satisfied only with the emulated side. /me
	 * grumbles.
	 */
	for (i = 0; i < num_entries; i++) {
		if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
			return true;

		if (pad[0] || pad[1] || pad[2])
			return true;
	}
	return false;
}
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
			    struct kvm_cpuid_entry2 __user *entries,
			    unsigned int type)
{
	static const u32 funcs[] = {
		0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE,
	};

	struct kvm_cpuid_array array = {
		.nent = 0,
		.maxnent = cpuid->nent,
	};
	int r, i;

	if (cpuid->nent < 1)
		return -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		cpuid->nent = KVM_MAX_CPUID_ENTRIES;

	if (sanity_check_entries(entries, cpuid->nent, type))
		return -EINVAL;

	array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
					   cpuid->nent));
	if (!array.entries)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(funcs); i++) {
		r = get_cpuid_func(&array, funcs[i], type);
		if (r)
			goto out_free;
	}
	cpuid->nent = array.nent;

	if (copy_to_user(entries, array.entries,
			 array.nent * sizeof(struct kvm_cpuid_entry2)))
		r = -EFAULT;

out_free:
	vfree(array.entries);
	return r;
}
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
					      u32 function, u32 index)
{
	struct kvm_cpuid_entry2 *e;
	int i;

	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
		e = &vcpu->arch.cpuid_entries[i];

		if (e->function == function && (e->index == index ||
		    !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
			return e;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
/*
 * Intel CPUID semantics treats any query for an out-of-range leaf as if the
 * highest basic leaf (i.e. CPUID.0H:EAX) were requested.  AMD CPUID semantics
 * returns all zeroes for any undefined leaf, whether or not the leaf is in
 * range.  Centaur/VIA follows Intel semantics.
 *
 * A leaf is considered out-of-range if its function is higher than the maximum
 * supported leaf of its associated class or if its associated class does not
 * exist.
 *
 * There are three primary classes to be considered, with their respective
 * ranges described as "<base> - <top>[,<base2> - <top2>] inclusive.  A primary
 * class exists if a guest CPUID entry for its <base> leaf exists.  For a given
 * class, CPUID.<base>.EAX contains the max supported leaf for the class.
 *
 *  - Basic:      0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff
 *  - Hypervisor: 0x40000000 - 0x4fffffff
 *  - Extended:   0x80000000 - 0xbfffffff
 *  - Centaur:    0xc0000000 - 0xcfffffff
 *
 * The Hypervisor class is further subdivided into sub-classes that each act as
 * their own independent class associated with a 0x100 byte range.  E.g. if Qemu
 * is advertising support for both HyperV and KVM, the resulting Hypervisor
 * CPUID sub-classes are:
 *
 *  - HyperV: 0x40000000 - 0x400000ff
 *  - KVM:    0x40000100 - 0x400001ff
 */
static struct kvm_cpuid_entry2 *
get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
{
	struct kvm_cpuid_entry2 *basic, *class;
	u32 function = *fn_ptr;

	basic = kvm_find_cpuid_entry(vcpu, 0, 0);
	if (!basic)
		return NULL;

	if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) ||
	    is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx))
		return NULL;

	if (function >= 0x40000000 && function <= 0x4fffffff)
		class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0);
	else if (function >= 0xc0000000)
		class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0);
	else
		class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);

	if (class && function <= class->eax)
		return NULL;

	/*
	 * Leaf specific adjustments are also applied when redirecting to the
	 * max basic entry, e.g. if the max basic leaf is 0xb but there is no
	 * entry for CPUID.0xb.index (see below), then the output value for EDX
	 * needs to be pulled from CPUID.0xb.1.
	 */
	*fn_ptr = basic->eax;

	/*
	 * The class does not exist or the requested function is out of range;
	 * the effective CPUID entry is the max basic leaf.  Note, the index of
	 * the original requested leaf is observed!
	 */
	return kvm_find_cpuid_entry(vcpu, basic->eax, index);
}
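/*
 * Example (hypothetical guest CPUID): if the guest's max basic leaf
 * (CPUID.0H:EAX) is 0xd and the guest queries leaf 0x14, the Basic class
 * entry is found but 0x14 exceeds its max of 0xd, so *fn_ptr is rewritten to
 * 0xd and the entry for CPUID.0xd at the originally requested index is
 * returned, mirroring Intel's out-of-range redirection.  For an AMD/Hygon
 * guest the function bails out early and the caller zeroes the output
 * registers instead.
 */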
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
	       u32 *ecx, u32 *edx, bool exact_only)
{
	u32 orig_function = *eax, function = *eax, index = *ecx;
	struct kvm_cpuid_entry2 *entry;
	bool exact, used_max_basic = false;

	entry = kvm_find_cpuid_entry(vcpu, function, index);
	exact = !!entry;

	if (!entry && !exact_only) {
		entry = get_out_of_range_cpuid_entry(vcpu, &function, index);
		used_max_basic = !!entry;
	}

	if (entry) {
		*eax = entry->eax;
		*ebx = entry->ebx;
		*ecx = entry->ecx;
		*edx = entry->edx;
		if (function == 7 && index == 0) {
			u64 data;

			if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
			    (data & TSX_CTRL_CPUID_CLEAR))
				*ebx &= ~(F(RTM) | F(HLE));
		}
	} else {
		*eax = *ebx = *ecx = *edx = 0;
		/*
		 * When leaf 0BH or 1FH is defined, CL is pass-through
		 * and EDX is always the x2APIC ID, even for undefined
		 * subleaves. Index 1 will exist iff the leaf is
		 * implemented, so we pass through CL iff leaf 1
		 * exists. EDX can be copied from any existing index.
		 */
		if (function == 0xb || function == 0x1f) {
			entry = kvm_find_cpuid_entry(vcpu, function, 1);
			if (entry) {
				*ecx = index & 0xff;
				*edx = entry->edx;
			}
		}
	}
	trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact,
			used_max_basic);
	return exact;
}
EXPORT_SYMBOL_GPL(kvm_cpuid);
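/*
 * Callers that emulate the CPUID instruction itself (see kvm_emulate_cpuid()
 * below) pass exact_only=false to get the Intel-style out-of-range
 * redirection above; callers that only care whether userspace explicitly
 * provided a leaf pass exact_only=true and key off the return value, which
 * reports whether an exact match was found.
 */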
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
	u32 eax, ebx, ecx, edx;

	if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
		return 1;

	eax = kvm_rax_read(vcpu);
	ecx = kvm_rcx_read(vcpu);
	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
	kvm_rax_write(vcpu, eax);
	kvm_rbx_write(vcpu, ebx);
	kvm_rcx_write(vcpu, ecx);
	kvm_rdx_write(vcpu, edx);
	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);