arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10
  11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13 #include <linux/thread_info.h>
  14 #include <linux/capability.h>
  15 #include <linux/miscdevice.h>
  16 #include <linux/ratelimit.h>
  17 #include <linux/kallsyms.h>
  18 #include <linux/rcupdate.h>
  19 #include <linux/kobject.h>
  20 #include <linux/uaccess.h>
  21 #include <linux/kdebug.h>
  22 #include <linux/kernel.h>
  23 #include <linux/percpu.h>
  24 #include <linux/string.h>
  25 #include <linux/device.h>
  26 #include <linux/syscore_ops.h>
  27 #include <linux/delay.h>
  28 #include <linux/ctype.h>
  29 #include <linux/sched.h>
  30 #include <linux/sysfs.h>
  31 #include <linux/types.h>
  32 #include <linux/slab.h>
  33 #include <linux/init.h>
  34 #include <linux/kmod.h>
  35 #include <linux/poll.h>
  36 #include <linux/nmi.h>
  37 #include <linux/cpu.h>
  38 #include <linux/smp.h>
  39 #include <linux/fs.h>
  40 #include <linux/mm.h>
  41 #include <linux/debugfs.h>
  42 #include <linux/irq_work.h>
  43 #include <linux/export.h>
  44
  45 #include <asm/processor.h>
  46 #include <asm/traps.h>
  47 #include <asm/tlbflush.h>
  48 #include <asm/mce.h>
  49 #include <asm/msr.h>
  50
  51 #include "mce-internal.h"
  52
/*
 * Mutex whose ownership rcu_dereference_check_mce() accepts as an
 * alternative to being inside an RCU-sched read-side critical section.
 * NOTE(review): presumably taken by the character-device read path, which
 * is not visible in this part of the file - confirm.
 */
   53 static DEFINE_MUTEX(mce_chrdev_read_mutex);
   54
/*
 * Load @p with acquire semantics (smp_load_acquire()) after asserting, for
 * lockdep, that the caller either holds mce_chrdev_read_mutex or is in an
 * RCU-sched read-side section. Evaluates to the loaded value; @p must be an
 * lvalue because its address is taken.
 */
   55 #define rcu_dereference_check_mce(p) \
   56 ({ \
   57         rcu_lockdep_assert(rcu_read_lock_sched_held() || \
   58                            lockdep_is_held(&mce_chrdev_read_mutex), \
   59                            "suspicious rcu_dereference_check_mce() usage"); \
   60         smp_load_acquire(&(p)); \
   61 })
  62
  63 #define CREATE_TRACE_POINTS
  64 #include <trace/events/mce.h>
  65
  66 #define SPINUNIT                100     /* 100ns */
  67
  68 DEFINE_PER_CPU(unsigned, mce_exception_count);
  69
  70 struct mce_bank *mce_banks __read_mostly;
  71 struct mce_vendor_flags mce_flags __read_mostly;
  72
  73 struct mca_config mca_cfg __read_mostly = {
  74         .bootlog  = -1,
  75         /*
  76          * Tolerant levels:
  77          * 0: always panic on uncorrected errors, log corrected errors
  78          * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  79          * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  80          * 3: never panic or SIGBUS, log all errors (for testing only)
  81          */
  82         .tolerant = 1,
  83         .monarch_timeout = -1
  84 };
  85
   86 /* User mode helper program triggered by machine check event */
/* Bit 0 is set by mce_log() once it has stored a record in mcelog. */
   87 static unsigned long            mce_need_notify;
/* Helper program string; the same buffer is used as argv[0] below. */
   88 static char                     mce_helper[128];
   89 static char                     *mce_helper_argv[2] = { mce_helper, NULL };
   90
/* NOTE(review): presumably where character-device readers sleep - confirm. */
   91 static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
   92
/* Per-CPU record graded by mce_reign() and cleared there afterwards. */
   93 static DEFINE_PER_CPU(struct mce, mces_seen);
/* Set by mce_timed_out() when a wait ran out; reported by mce_panic(). */
   94 static int                      cpu_missing;
  95
  96 /*
   97  * MCA banks polled by the periodic polling timer for corrected events.
  98  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
  99  */
 100 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 101         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 102 };
 103
 104 /*
 105  * MCA banks controlled through firmware first for corrected errors.
 106  * This is a global list of banks for which we won't enable CMCI and we
 107  * won't poll. Firmware controls these banks and is responsible for
 108  * reporting corrected errors through GHES. Uncorrected/recoverable
 109  * errors are still notified through a machine check.
 110  */
 111 mce_banks_t mce_banks_ce_disabled;
 112
 113 static DEFINE_PER_CPU(struct work_struct, mce_work);
 114
 115 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 116
 117 /*
 118  * CPU/chipset specific EDAC code can register a notifier call here to print
 119  * MCE errors in a human-readable form.
 120  */
 121 static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 122
 123 /* Do initial initialization of a struct mce */
 124 void mce_setup(struct mce *m)
 125 {
 126         memset(m, 0, sizeof(struct mce));
 127         m->cpu = m->extcpu = smp_processor_id();
 128         rdtscll(m->tsc);
 129         /* We hope get_seconds stays lockless */
 130         m->time = get_seconds();
 131         m->cpuvendor = boot_cpu_data.x86_vendor;
 132         m->cpuid = cpuid_eax(1);
 133         m->socketid = cpu_data(m->extcpu).phys_proc_id;
 134         m->apicid = cpu_data(m->extcpu).initial_apicid;
 135         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 136 }
 137
 138 DEFINE_PER_CPU(struct mce, injectm);
 139 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 140
 141 /*
 142  * Lockless MCE logging infrastructure.
 143  * This avoids deadlocks on printk locks without having to break locks. Also
 144  * separate MCEs from kernel messages to avoid bogus bug reports.
 145  */
 146
 147 static struct mce_log mcelog = {
 148         .signature      = MCE_LOG_SIGNATURE,
 149         .len            = MCE_LOG_LEN,
 150         .recordlen      = sizeof(struct mce),
 151 };
 152
 153 void mce_log(struct mce *mce)
 154 {
 155         unsigned next, entry;
 156
 157         /* Emit the trace record: */
 158         trace_mce_record(mce);
 159
 160         atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
 161
 162         mce->finished = 0;
 163         wmb();
 164         for (;;) {
 165                 entry = rcu_dereference_check_mce(mcelog.next);
 166                 for (;;) {
 167
 168                         /*
 169                          * When the buffer fills up discard new entries.
 170                          * Assume that the earlier errors are the more
 171                          * interesting ones:
 172                          */
 173                         if (entry >= MCE_LOG_LEN) {
 174                                 set_bit(MCE_OVERFLOW,
 175                                         (unsigned long *)&mcelog.flags);
 176                                 return;
 177                         }
 178                         /* Old left over entry. Skip: */
 179                         if (mcelog.entry[entry].finished) {
 180                                 entry++;
 181                                 continue;
 182                         }
 183                         break;
 184                 }
 185                 smp_rmb();
 186                 next = entry + 1;
 187                 if (cmpxchg(&mcelog.next, entry, next) == entry)
 188                         break;
 189         }
 190         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 191         wmb();
 192         mcelog.entry[entry].finished = 1;
 193         wmb();
 194
 195         mce->finished = 1;
 196         set_bit(0, &mce_need_notify);
 197 }
 198
 199 static void drain_mcelog_buffer(void)
 200 {
 201         unsigned int next, i, prev = 0;
 202
 203         next = ACCESS_ONCE(mcelog.next);
 204
 205         do {
 206                 struct mce *m;
 207
 208                 /* drain what was logged during boot */
 209                 for (i = prev; i < next; i++) {
 210                         unsigned long start = jiffies;
 211                         unsigned retries = 1;
 212
 213                         m = &mcelog.entry[i];
 214
 215                         while (!m->finished) {
 216                                 if (time_after_eq(jiffies, start + 2*retries))
 217                                         retries++;
 218
 219                                 cpu_relax();
 220
 221                                 if (!m->finished && retries >= 4) {
 222                                         pr_err("skipping error being logged currently!\n");
 223                                         break;
 224                                 }
 225                         }
 226                         smp_rmb();
 227                         atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 228                 }
 229
 230                 memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
 231                 prev = next;
 232                 next = cmpxchg(&mcelog.next, prev, 0);
 233         } while (next != prev);
 234 }
 235
 236
/*
 * Add @nb to the MCE decoder notifier chain and then replay the records
 * already sitting in mcelog through the chain via drain_mcelog_buffer().
 */
  237 void mce_register_decode_chain(struct notifier_block *nb)
  238 {
  239         atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
  240         drain_mcelog_buffer();
  241 }
  242 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 243
/* Remove @nb from the MCE decoder notifier chain. */
  244 void mce_unregister_decode_chain(struct notifier_block *nb)
  245 {
  246         atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
  247 }
  248 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 249
 250 static void print_mce(struct mce *m)
 251 {
 252         int ret = 0;
 253
 254         pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 255                m->extcpu, m->mcgstatus, m->bank, m->status);
 256
 257         if (m->ip) {
 258                 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 259                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 260                                 m->cs, m->ip);
 261
 262                 if (m->cs == __KERNEL_CS)
 263                         print_symbol("{%s}", m->ip);
 264                 pr_cont("\n");
 265         }
 266
 267         pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 268         if (m->addr)
 269                 pr_cont("ADDR %llx ", m->addr);
 270         if (m->misc)
 271                 pr_cont("MISC %llx ", m->misc);
 272
 273         pr_cont("\n");
 274         /*
 275          * Note this output is parsed by external tools and old fields
 276          * should not be changed.
 277          */
 278         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 279                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 280                 cpu_data(m->extcpu).microcode);
 281
 282         /*
 283          * Print out human-readable details about the MCE error,
 284          * (if the CPU has an implementation for that)
 285          */
 286         ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 287         if (ret == NOTIFY_STOP)
 288                 return;
 289
 290         pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 291 }
 292
/* Seconds wait_for_panic() spins before it panics on its own. */
  293 #define PANIC_TIMEOUT 5 /* 5 seconds */
  294
/* Incremented by every CPU entering mce_panic(); only the first proceeds. */
  295 static atomic_t mce_panicked;
  296
/* When non-zero, mce_panic() only prints and does not call panic(). */
  297 static int fake_panic;
/* Counts fake panics so that only the first one is printed. */
  298 static atomic_t mce_fake_panicked;
 299
 300 /* Panic in progress. Enable interrupts and wait for final IPI */
 301 static void wait_for_panic(void)
 302 {
 303         long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 304
 305         preempt_disable();
 306         local_irq_enable();
 307         while (timeout-- > 0)
 308                 udelay(1);
 309         if (panic_timeout == 0)
 310                 panic_timeout = mca_cfg.panic_timeout;
 311         panic("Panicing machine check CPU died");
 312 }
 313
 314 static void mce_panic(const char *msg, struct mce *final, char *exp)
 315 {
 316         int i, apei_err = 0;
 317
 318         if (!fake_panic) {
 319                 /*
 320                  * Make sure only one CPU runs in machine check panic
 321                  */
 322                 if (atomic_inc_return(&mce_panicked) > 1)
 323                         wait_for_panic();
 324                 barrier();
 325
 326                 bust_spinlocks(1);
 327                 console_verbose();
 328         } else {
 329                 /* Don't log too much for fake panic */
 330                 if (atomic_inc_return(&mce_fake_panicked) > 1)
 331                         return;
 332         }
 333         /* First print corrected ones that are still unlogged */
 334         for (i = 0; i < MCE_LOG_LEN; i++) {
 335                 struct mce *m = &mcelog.entry[i];
 336                 if (!(m->status & MCI_STATUS_VAL))
 337                         continue;
 338                 if (!(m->status & MCI_STATUS_UC)) {
 339                         print_mce(m);
 340                         if (!apei_err)
 341                                 apei_err = apei_write_mce(m);
 342                 }
 343         }
 344         /* Now print uncorrected but with the final one last */
 345         for (i = 0; i < MCE_LOG_LEN; i++) {
 346                 struct mce *m = &mcelog.entry[i];
 347                 if (!(m->status & MCI_STATUS_VAL))
 348                         continue;
 349                 if (!(m->status & MCI_STATUS_UC))
 350                         continue;
 351                 if (!final || memcmp(m, final, sizeof(struct mce))) {
 352                         print_mce(m);
 353                         if (!apei_err)
 354                                 apei_err = apei_write_mce(m);
 355                 }
 356         }
 357         if (final) {
 358                 print_mce(final);
 359                 if (!apei_err)
 360                         apei_err = apei_write_mce(final);
 361         }
 362         if (cpu_missing)
 363                 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 364         if (exp)
 365                 pr_emerg(HW_ERR "Machine check: %s\n", exp);
 366         if (!fake_panic) {
 367                 if (panic_timeout == 0)
 368                         panic_timeout = mca_cfg.panic_timeout;
 369                 panic(msg);
 370         } else
 371                 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 372 }
 373
 374 /* Support code for software error injection */
 375
 376 static int msr_to_offset(u32 msr)
 377 {
 378         unsigned bank = __this_cpu_read(injectm.bank);
 379
 380         if (msr == mca_cfg.rip_msr)
 381                 return offsetof(struct mce, ip);
 382         if (msr == MSR_IA32_MCx_STATUS(bank))
 383                 return offsetof(struct mce, status);
 384         if (msr == MSR_IA32_MCx_ADDR(bank))
 385                 return offsetof(struct mce, addr);
 386         if (msr == MSR_IA32_MCx_MISC(bank))
 387                 return offsetof(struct mce, misc);
 388         if (msr == MSR_IA32_MCG_STATUS)
 389                 return offsetof(struct mce, mcgstatus);
 390         return -1;
 391 }
 392
 393 /* MSR access wrappers used for error injection */
 394 static u64 mce_rdmsrl(u32 msr)
 395 {
 396         u64 v;
 397
 398         if (__this_cpu_read(injectm.finished)) {
 399                 int offset = msr_to_offset(msr);
 400
 401                 if (offset < 0)
 402                         return 0;
 403                 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 404         }
 405
 406         if (rdmsrl_safe(msr, &v)) {
 407                 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 408                 /*
 409                  * Return zero in case the access faulted. This should
 410                  * not happen normally but can happen if the CPU does
 411                  * something weird, or if the code is buggy.
 412                  */
 413                 v = 0;
 414         }
 415
 416         return v;
 417 }
 418
 419 static void mce_wrmsrl(u32 msr, u64 v)
 420 {
 421         if (__this_cpu_read(injectm.finished)) {
 422                 int offset = msr_to_offset(msr);
 423
 424                 if (offset >= 0)
 425                         *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 426                 return;
 427         }
 428         wrmsrl(msr, v);
 429 }
 430
 431 /*
 432  * Collect all global (w.r.t. this processor) status about this machine
 433  * check into our "mce" struct so that we can use it later to assess
 434  * the severity of the problem as we read per-bank specific details.
 435  */
 436 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 437 {
 438         mce_setup(m);
 439
 440         m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 441         if (regs) {
 442                 /*
 443                  * Get the address of the instruction at the time of
 444                  * the machine check error.
 445                  */
 446                 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 447                         m->ip = regs->ip;
 448                         m->cs = regs->cs;
 449
 450                         /*
 451                          * When in VM86 mode make the cs look like ring 3
 452                          * always. This is a lie, but it's better than passing
 453                          * the additional vm86 bit around everywhere.
 454                          */
 455                         if (v8086_mode(regs))
 456                                 m->cs |= 3;
 457                 }
 458                 /* Use accurate RIP reporting if available. */
 459                 if (mca_cfg.rip_msr)
 460                         m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 461         }
 462 }
 463
 464 /*
 465  * Simple lockless ring to communicate PFNs from the exception handler with the
 466  * process context work function. This is vastly simplified because there's
 467  * only a single reader and a single writer.
 468  */
/*
 * Number of slots in the ring. mce_ring_add() treats the ring as full when
 * advancing 'end' would make it equal to 'start', so one slot stays unused.
 */
  469 #define MCE_RING_SIZE 16        /* we use one entry less */
  470
  471 struct mce_ring {
/* Index of the next entry to consume; advanced by mce_ring_get(). */
  472         unsigned short start;
/* Index of the next free slot; advanced by mce_ring_add(). */
  473         unsigned short end;
/* The stored page frame numbers. */
  474         unsigned long ring[MCE_RING_SIZE];
  475 };
  476 static DEFINE_PER_CPU(struct mce_ring, mce_ring);
 477
 478 /* Runs with CPU affinity in workqueue */
 479 static int mce_ring_empty(void)
 480 {
 481         struct mce_ring *r = this_cpu_ptr(&mce_ring);
 482
 483         return r->start == r->end;
 484 }
 485
 486 static int mce_ring_get(unsigned long *pfn)
 487 {
 488         struct mce_ring *r;
 489         int ret = 0;
 490
 491         *pfn = 0;
 492         get_cpu();
 493         r = this_cpu_ptr(&mce_ring);
 494         if (r->start == r->end)
 495                 goto out;
 496         *pfn = r->ring[r->start];
 497         r->start = (r->start + 1) % MCE_RING_SIZE;
 498         ret = 1;
 499 out:
 500         put_cpu();
 501         return ret;
 502 }
 503
 504 /* Always runs in MCE context with preempt off */
 505 static int mce_ring_add(unsigned long pfn)
 506 {
 507         struct mce_ring *r = this_cpu_ptr(&mce_ring);
 508         unsigned next;
 509
 510         next = (r->end + 1) % MCE_RING_SIZE;
 511         if (next == r->start)
 512                 return -1;
 513         r->ring[r->end] = pfn;
 514         wmb();
 515         r->end = next;
 516         return 0;
 517 }
 518
 519 int mce_available(struct cpuinfo_x86 *c)
 520 {
 521         if (mca_cfg.disabled)
 522                 return 0;
 523         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 524 }
 525
 526 static void mce_schedule_work(void)
 527 {
 528         if (!mce_ring_empty())
 529                 schedule_work(this_cpu_ptr(&mce_work));
 530 }
 531
/* Per-CPU irq_work item queued by mce_report_event(). */
  532 static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
  533
/*
 * Callback with the irq_work signature: runs the notification step
 * (mce_notify_irq()) and schedules the per-CPU work item if the PFN ring
 * is not empty. @entry is unused.
 * NOTE(review): presumably installed as the handler of mce_irq_work; the
 * initialisation is not visible in this part of the file.
 */
  534 static void mce_irq_work_cb(struct irq_work *entry)
  535 {
  536         mce_notify_irq();
  537         mce_schedule_work();
  538 }
 539
 540 static void mce_report_event(struct pt_regs *regs)
 541 {
 542         if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 543                 mce_notify_irq();
 544                 /*
 545                  * Triggering the work queue here is just an insurance
 546                  * policy in case the syscall exit notify handler
 547                  * doesn't run soon enough or ends up running on the
 548                  * wrong CPU (can happen when audit sleeps)
 549                  */
 550                 mce_schedule_work();
 551                 return;
 552         }
 553
 554         irq_work_queue(this_cpu_ptr(&mce_irq_work));
 555 }
 556
 557 /*
 558  * Read ADDR and MISC registers.
 559  */
 560 static void mce_read_aux(struct mce *m, int i)
 561 {
 562         if (m->status & MCI_STATUS_MISCV)
 563                 m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
 564         if (m->status & MCI_STATUS_ADDRV) {
 565                 m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
 566
 567                 /*
 568                  * Mask the reported address by the reported granularity.
 569                  */
 570                 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 571                         u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 572                         m->addr >>= shift;
 573                         m->addr <<= shift;
 574                 }
 575         }
 576 }
 577
 578 static bool memory_error(struct mce *m)
 579 {
 580         struct cpuinfo_x86 *c = &boot_cpu_data;
 581
 582         if (c->x86_vendor == X86_VENDOR_AMD) {
 583                 /*
 584                  * coming soon
 585                  */
 586                 return false;
 587         } else if (c->x86_vendor == X86_VENDOR_INTEL) {
 588                 /*
 589                  * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 590                  *
 591                  * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 592                  * indicating a memory error. Bit 8 is used for indicating a
 593                  * cache hierarchy error. The combination of bit 2 and bit 3
 594                  * is used for indicating a `generic' cache hierarchy error
 595                  * But we can't just blindly check the above bits, because if
 596                  * bit 11 is set, then it is a bus/interconnect error - and
 597                  * either way the above bits just gives more detail on what
 598                  * bus/interconnect error happened. Note that bit 12 can be
 599                  * ignored, as it's the "filter" bit.
 600                  */
 601                 return (m->status & 0xef80) == BIT(7) ||
 602                        (m->status & 0xef00) == BIT(8) ||
 603                        (m->status & 0xeffc) == 0xc;
 604         }
 605
 606         return false;
 607 }
 608
 609 DEFINE_PER_CPU(unsigned, mce_poll_count);
 610
 611 /*
 612  * Poll for corrected events or events that happened before reset.
 613  * Those are just logged through /dev/mcelog.
 614  *
 615  * This is executed in standard interrupt context.
 616  *
 617  * Note: spec recommends to panic for fatal unsignalled
 618  * errors here. However this would be quite problematic --
 619  * we would need to reimplement the Monarch handling and
 620  * it would mess up the exclusion between exception handler
 621  * and poll hander -- * so we skip this for now.
 622  * These cases should not happen anyways, or only when the CPU
 623  * is already totally * confused. In this case it's likely it will
 624  * not fully execute the machine check handler either.
 625  */
 626 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 627 {
 628         bool error_logged = false;
 629         struct mce m;
 630         int severity;
 631         int i;
 632
 633         this_cpu_inc(mce_poll_count);
 634
 635         mce_gather_info(&m, NULL);
 636
 637         for (i = 0; i < mca_cfg.banks; i++) {
 638                 if (!mce_banks[i].ctl || !test_bit(i, *b))
 639                         continue;
 640
 641                 m.misc = 0;
 642                 m.addr = 0;
 643                 m.bank = i;
 644                 m.tsc = 0;
 645
 646                 barrier();
 647                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 648                 if (!(m.status & MCI_STATUS_VAL))
 649                         continue;
 650
 651
 652                 /*
 653                  * Uncorrected or signalled events are handled by the exception
 654                  * handler when it is enabled, so don't process those here.
 655                  *
 656                  * TBD do the same check for MCI_STATUS_EN here?
 657                  */
 658                 if (!(flags & MCP_UC) &&
 659                     (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 660                         continue;
 661
 662                 mce_read_aux(&m, i);
 663
 664                 if (!(flags & MCP_TIMESTAMP))
 665                         m.tsc = 0;
 666
 667                 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 668
 669                 /*
 670                  * In the cases where we don't have a valid address after all,
 671                  * do not add it into the ring buffer.
 672                  */
 673                 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
 674                         if (m.status & MCI_STATUS_ADDRV) {
 675                                 mce_ring_add(m.addr >> PAGE_SHIFT);
 676                                 mce_schedule_work();
 677                         }
 678                 }
 679
 680                 /*
 681                  * Don't get the IP here because it's unlikely to
 682                  * have anything to do with the actual error location.
 683                  */
 684                 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
 685                         error_logged = true;
 686                         mce_log(&m);
 687                 }
 688
 689                 /*
 690                  * Clear state for this bank.
 691                  */
 692                 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 693         }
 694
 695         /*
 696          * Don't clear MCG_STATUS here because it's only defined for
 697          * exceptions.
 698          */
 699
 700         sync_core();
 701
 702         return error_logged;
 703 }
 704 EXPORT_SYMBOL_GPL(machine_check_poll);
 705
 706 /*
 707  * Do a quick check if any of the events requires a panic.
 708  * This decides if we keep the events around or clear them.
 709  */
 710 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 711                           struct pt_regs *regs)
 712 {
 713         int i, ret = 0;
 714         char *tmp;
 715
 716         for (i = 0; i < mca_cfg.banks; i++) {
 717                 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 718                 if (m->status & MCI_STATUS_VAL) {
 719                         __set_bit(i, validp);
 720                         if (quirk_no_way_out)
 721                                 quirk_no_way_out(i, m, regs);
 722                 }
 723
 724                 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 725                         *msg = tmp;
 726                         ret = 1;
 727                 }
 728         }
 729         return ret;
 730 }
 731
 732 /*
 733  * Variable to establish order between CPUs while scanning.
 734  * Each CPU spins initially until executing is equal its number.
 735  */
 736 static atomic_t mce_executing;
 737
 738 /*
 739  * Defines order of CPUs on entry. First CPU becomes Monarch.
 740  */
 741 static atomic_t mce_callin;
 742
 743 /*
 744  * Check if a timeout waiting for other CPUs happened.
 745  */
 746 static int mce_timed_out(u64 *t, const char *msg)
 747 {
 748         /*
 749          * The others already did panic for some reason.
 750          * Bail out like in a timeout.
 751          * rmb() to tell the compiler that system_state
 752          * might have been modified by someone else.
 753          */
 754         rmb();
 755         if (atomic_read(&mce_panicked))
 756                 wait_for_panic();
 757         if (!mca_cfg.monarch_timeout)
 758                 goto out;
 759         if ((s64)*t < SPINUNIT) {
 760                 if (mca_cfg.tolerant <= 1)
 761                         mce_panic(msg, NULL, NULL);
 762                 cpu_missing = 1;
 763                 return 1;
 764         }
 765         *t -= SPINUNIT;
 766 out:
 767         touch_nmi_watchdog();
 768         return 0;
 769 }
 770
 771 /*
 772  * The Monarch's reign.  The Monarch is the CPU who entered
 773  * the machine check handler first. It waits for the others to
 774  * raise the exception too and then grades them. When any
 775  * error is fatal panic. Only then let the others continue.
 776  *
 777  * The other CPUs entering the MCE handler will be controlled by the
 778  * Monarch. They are called Subjects.
 779  *
 780  * This way we prevent any potential data corruption in a unrecoverable case
 781  * and also makes sure always all CPU's errors are examined.
 782  *
 783  * Also this detects the case of a machine check event coming from outer
 784  * space (not detected by any CPUs) In this case some external agent wants
 785  * us to shut down, so panic too.
 786  *
 787  * The other CPUs might still decide to panic if the handler happens
 788  * in a unrecoverable place, but in this case the system is in a semi-stable
 789  * state and won't corrupt anything by itself. It's ok to let the others
 790  * continue for a bit first.
 791  *
 792  * All the spin loops have timeouts; when a timeout happens a CPU
 793  * typically elects itself to be Monarch.
 794  */
 795 static void mce_reign(void)
 796 {
 797         int cpu;
 798         struct mce *m = NULL;
 799         int global_worst = 0;
 800         char *msg = NULL;
 801         char *nmsg = NULL;
 802
 803         /*
 804          * This CPU is the Monarch and the other CPUs have run
 805          * through their handlers.
 806          * Grade the severity of the errors of all the CPUs.
 807          */
 808         for_each_possible_cpu(cpu) {
 809                 int severity = mce_severity(&per_cpu(mces_seen, cpu),
 810                                             mca_cfg.tolerant,
 811                                             &nmsg, true);
 812                 if (severity > global_worst) {
 813                         msg = nmsg;
 814                         global_worst = severity;
 815                         m = &per_cpu(mces_seen, cpu);
 816                 }
 817         }
 818
 819         /*
 820          * Cannot recover? Panic here then.
 821          * This dumps all the mces in the log buffer and stops the
 822          * other CPUs.
 823          */
 824         if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 825                 mce_panic("Fatal machine check", m, msg);
 826
 827         /*
 828          * For UC somewhere we let the CPU who detects it handle it.
 829          * Also must let continue the others, otherwise the handling
 830          * CPU could deadlock on a lock.
 831          */
 832
 833         /*
 834          * No machine check event found. Must be some external
 835          * source or one CPU is hung. Panic.
 836          */
 837         if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 838                 mce_panic("Fatal machine check from unknown source", NULL, NULL);
 839
 840         /*
 841          * Now clear all the mces_seen so that they don't reappear on
 842          * the next mce.
 843          */
 844         for_each_possible_cpu(cpu)
 845                 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 846 }
 847
/*
 * Sum of the no_way_out values that the CPUs contribute in mce_start();
 * reset to 0 there when a wait times out.
 */
  848 static atomic_t global_nwo;
 849
/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 *
 * @no_way_out: on entry, this CPU's local no_way_out value, which is
 *              added to global_nwo; on a successful rendezvous it is
 *              overwritten with the accumulated global_nwo value.
 *
 * Returns this CPU's call-in order (1 for the Monarch), or -1 when
 * mca_cfg.monarch_timeout is 0 or mce_timed_out() reports a timeout
 * while waiting (in which case *no_way_out is left untouched).
 */
static int mce_start(int *no_way_out)
{
        int order;
        int cpus = num_online_cpus();
        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

        /* A zero timeout means the rendezvous is not used at all. */
        if (!timeout)
                return -1;

        atomic_add(*no_way_out, &global_nwo);
        /*
         * global_nwo should be updated before mce_callin
         */
        smp_wmb();
        /* The value returned here is this CPU's position in the call-in order. */
        order = atomic_inc_return(&mce_callin);

        /*
         * Wait for everyone.
         */
        while (atomic_read(&mce_callin) != cpus) {
                if (mce_timed_out(&timeout,
                                  "Timeout: Not all CPUs entered broadcast exception handler")) {
                        atomic_set(&global_nwo, 0);
                        return -1;
                }
                ndelay(SPINUNIT);
        }

        /*
         * mce_callin should be read before global_nwo
         */
        smp_rmb();

        if (order == 1) {
                /*
                 * Monarch: Starts executing now, the others wait.
                 */
                atomic_set(&mce_executing, 1);
        } else {
                /*
                 * Subject: Now start the scanning loop one by one in
                 * the original callin order.
                 * This way when there are any shared banks it will be
                 * only seen by one CPU before cleared, avoiding duplicates.
                 */
                while (atomic_read(&mce_executing) < order) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Subject CPUs unable to finish machine check processing")) {
                                atomic_set(&global_nwo, 0);
                                return -1;
                        }
                        ndelay(SPINUNIT);
                }
        }

        /*
         * Cache the global no_way_out state.
         */
        *no_way_out = atomic_read(&global_nwo);

        return order;
}
 919
/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 *
 * @order: this CPU's call-in order as returned by mce_start(); a negative
 *         value means the rendezvous failed and only the reset is done.
 *
 * Returns 0 when the rendezvous completed (the Monarch has run mce_reign(),
 * or a Subject has seen mce_executing drop back to 0). Returns -1 when
 * mca_cfg.monarch_timeout is 0, @order is negative, or a wait timed out;
 * in all of those cases the global rendezvous state is reset here.
 */
static int mce_end(int order)
{
        int ret = -1;
        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

        if (!timeout)
                goto reset;
        if (order < 0)
                goto reset;

        /*
         * Allow others to run.
         */
        atomic_inc(&mce_executing);

        if (order == 1) {
                /* CHECKME: Can this race with a parallel hotplug? */
                int cpus = num_online_cpus();

                /*
                 * Monarch: Wait for everyone to go through their scanning
                 * loops.
                 */
                while (atomic_read(&mce_executing) <= cpus) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Monarch CPU unable to finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }

                mce_reign();
                barrier();
                /* Success; the Monarch still falls through to the reset below. */
                ret = 0;
        } else {
                /*
                 * Subject: Wait for Monarch to finish.
                 */
                while (atomic_read(&mce_executing) != 0) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Monarch CPU did not finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }

                /*
                 * Don't reset anything. That's done by the Monarch.
                 */
                return 0;
        }

        /*
         * Reset all global state.
         */
reset:
        atomic_set(&global_nwo, 0);
        atomic_set(&mce_callin, 0);
        barrier();

        /*
         * Let others run again.
         */
        atomic_set(&mce_executing, 0);
        return ret;
}
 988
 989 /*
 990  * Check if the address reported by the CPU is in a format we can parse.
 991  * It would be possible to add code for most other cases, but all would
 992  * be somewhat complicated (e.g. segment offset would require an instruction
 993  * parser). So only support physical addresses up to page granuality for now.
 994  */
 995 static int mce_usable_address(struct mce *m)
 996 {
 997         if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 998                 return 0;
 999         if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
1000                 return 0;
1001         if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
1002                 return 0;
1003         return 1;
1004 }
1005
1006 static void mce_clear_state(unsigned long *toclear)
1007 {
1008         int i;
1009
1010         for (i = 0; i < mca_cfg.banks; i++) {
1011                 if (test_bit(i, toclear))
1012                         mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1013         }
1014 }
1015
1016 /*
1017  * The actual machine check handler. This only handles real
1018  * exceptions when something got corrupted coming in through int 18.
1019  *
1020  * This is executed in NMI context not subject to normal locking rules. This
1021  * implies that most kernel services cannot be safely used. Don't even
1022  * think about putting a printk in there!
1023  *
1024  * On Intel systems this is entered on all CPUs in parallel through
1025  * MCE broadcast. However some CPUs might be broken beyond repair,
1026  * so be always careful when synchronizing with others.
1027  */
1028 void do_machine_check(struct pt_regs *regs, long error_code)
1029 {
1030         struct mca_config *cfg = &mca_cfg;
1031         struct mce m, *final;
1032         enum ctx_state prev_state;
1033         int i;
1034         int worst = 0;
1035         int severity;
1036         /*
1037          * Establish sequential order between the CPUs entering the machine
1038          * check handler.
1039          */
1040         int order;
1041         /*
1042          * If no_way_out gets set, there is no safe way to recover from this
1043          * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1044          */
1045         int no_way_out = 0;
1046         /*
1047          * If kill_it gets set, there might be a way to recover from this
1048          * error.
1049          */
1050         int kill_it = 0;
1051         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1052         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1053         char *msg = "Unknown";
1054         u64 recover_paddr = ~0ull;
1055         int flags = MF_ACTION_REQUIRED;
1056         int lmce = 0;
1057
1058         prev_state = ist_enter(regs);
1059
1060         this_cpu_inc(mce_exception_count);
1061
1062         if (!cfg->banks)
1063                 goto out;
1064
1065         mce_gather_info(&m, regs);
1066
1067         final = this_cpu_ptr(&mces_seen);
1068         *final = m;
1069
1070         memset(valid_banks, 0, sizeof(valid_banks));
1071         no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1072
1073         barrier();
1074
1075         /*
1076          * When no restart IP might need to kill or panic.
1077          * Assume the worst for now, but if we find the
1078          * severity is MCE_AR_SEVERITY we have other options.
1079          */
1080         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1081                 kill_it = 1;
1082
1083         /*
1084          * Check if this MCE is signaled to only this logical processor
1085          */
1086         if (m.mcgstatus & MCG_STATUS_LMCES)
1087                 lmce = 1;
1088         else {
1089                 /*
1090                  * Go through all the banks in exclusion of the other CPUs.
1091                  * This way we don't report duplicated events on shared banks
1092                  * because the first one to see it will clear it.
1093                  * If this is a Local MCE, then no need to perform rendezvous.
1094                  */
1095                 order = mce_start(&no_way_out);
1096         }
1097
1098         for (i = 0; i < cfg->banks; i++) {
1099                 __clear_bit(i, toclear);
1100                 if (!test_bit(i, valid_banks))
1101                         continue;
1102                 if (!mce_banks[i].ctl)
1103                         continue;
1104
1105                 m.misc = 0;
1106                 m.addr = 0;
1107                 m.bank = i;
1108
1109                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1110                 if ((m.status & MCI_STATUS_VAL) == 0)
1111                         continue;
1112
1113                 /*
1114                  * Non uncorrected or non signaled errors are handled by
1115                  * machine_check_poll. Leave them alone, unless this panics.
1116                  */
1117                 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1118                         !no_way_out)
1119                         continue;
1120
1121                 /*
1122                  * Set taint even when machine check was not enabled.
1123                  */
1124                 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1125
1126                 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1127
1128                 /*
1129                  * When machine check was for corrected/deferred handler don't
1130                  * touch, unless we're panicing.
1131                  */
1132                 if ((severity == MCE_KEEP_SEVERITY ||
1133                      severity == MCE_UCNA_SEVERITY) && !no_way_out)
1134                         continue;
1135                 __set_bit(i, toclear);
1136                 if (severity == MCE_NO_SEVERITY) {
1137                         /*
1138                          * Machine check event was not enabled. Clear, but
1139                          * ignore.
1140                          */
1141                         continue;
1142                 }
1143
1144                 mce_read_aux(&m, i);
1145
1146                 /*
1147                  * Action optional error. Queue address for later processing.
1148                  * When the ring overflows we just ignore the AO error.
1149                  * RED-PEN add some logging mechanism when
1150                  * usable_address or mce_add_ring fails.
1151                  * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
1152                  */
1153                 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1154                         mce_ring_add(m.addr >> PAGE_SHIFT);
1155
1156                 mce_log(&m);
1157
1158                 if (severity > worst) {
1159                         *final = m;
1160                         worst = severity;
1161                 }
1162         }
1163
1164         /* mce_clear_state will clear *final, save locally for use later */
1165         m = *final;
1166
1167         if (!no_way_out)
1168                 mce_clear_state(toclear);
1169
1170         /*
1171          * Do most of the synchronization with other CPUs.
1172          * When there's any problem use only local no_way_out state.
1173          */
1174         if (!lmce) {
1175                 if (mce_end(order) < 0)
1176                         no_way_out = worst >= MCE_PANIC_SEVERITY;
1177         } else {
1178                 /*
1179                  * Local MCE skipped calling mce_reign()
1180                  * If we found a fatal error, we need to panic here.
1181                  */
1182                  if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1183                         mce_panic("Machine check from unknown source",
1184                                 NULL, NULL);
1185         }
1186
1187         /*
1188          * At insane "tolerant" levels we take no action. Otherwise
1189          * we only die if we have no other choice. For less serious
1190          * issues we try to recover, or limit damage to the current
1191          * process.
1192          */
1193         if (cfg->tolerant < 3) {
1194                 if (no_way_out)
1195                         mce_panic("Fatal machine check on current CPU", &m, msg);
1196                 if (worst == MCE_AR_SEVERITY) {
1197                         recover_paddr = m.addr;
1198                         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1199                                 flags |= MF_MUST_KILL;
1200                 } else if (kill_it) {
1201                         force_sig(SIGBUS, current);
1202                 }
1203         }
1204
1205         if (worst > 0)
1206                 mce_report_event(regs);
1207         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1208 out:
1209         sync_core();
1210
1211         if (recover_paddr == ~0ull)
1212                 goto done;
1213
1214         pr_err("Uncorrected hardware memory error in user-access at %llx",
1215                  recover_paddr);
1216         /*
1217          * We must call memory_failure() here even if the current process is
1218          * doomed. We still need to mark the page as poisoned and alert any
1219          * other users of the page.
1220          */
1221         ist_begin_non_atomic(regs);
1222         local_irq_enable();
1223         if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
1224                 pr_err("Memory error not recovered");
1225                 force_sig(SIGBUS, current);
1226         }
1227         local_irq_disable();
1228         ist_end_non_atomic();
1229 done:
1230         ist_exit(regs, prev_state);
1231 }
1232 EXPORT_SYMBOL_GPL(do_machine_check);
1233
#ifndef CONFIG_MEMORY_FAILURE
/*
 * Fallback used when the kernel is built without CONFIG_MEMORY_FAILURE:
 * no recovery is attempted, the event is only logged.
 *
 * @pfn:    page frame number of the affected page (printed in the message).
 * @vector: unused here.
 * @flags:  must not contain MF_ACTION_REQUIRED (enforced with BUG_ON).
 *
 * Always returns 0.
 */
int memory_failure(unsigned long pfn, int vector, int flags)
{
        /* mce_severity() should not hand us an ACTION_REQUIRED error */
        BUG_ON(flags & MF_ACTION_REQUIRED);
        pr_err("Uncorrected memory error in page 0x%lx ignored\n"
               "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
               pfn);

        return 0;
}
#endif
1246
1247 /*
1248  * Action optional processing happens here (picking up
1249  * from the list of faulting pages that do_machine_check()
1250  * placed into the "ring").
1251  */
1252 static void mce_process_work(struct work_struct *dummy)
1253 {
1254         unsigned long pfn;
1255
1256         while (mce_ring_get(&pfn))
1257                 memory_failure(pfn, MCE_VECTOR, 0);
1258 }
1259
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 *
 * The record is prepared with mce_setup(), tagged with MCE_THERMAL_BANK
 * as its bank and then passed to mce_log().
 */
void mce_log_therm_throt_event(__u64 status)
{
        struct mce m;

        mce_setup(&m);
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
        mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
1284
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
/* Upper bound of the polling period, in seconds (multiplied by HZ at use). */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;

/* Current polling period of each CPU. */
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
/* Per-CPU polling timer; its handler is mce_timer_fn(). */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
1294
/*
 * Default interval adjustment hook: returns @interval unchanged.
 */
static unsigned long mce_adjust_timer_default(unsigned long interval)
{
        return interval;
}

/*
 * Hook called by mce_timer_fn() to adjust the next polling interval when
 * mce_intel_cmci_poll() returns non-zero. Replaced with
 * cmci_intel_adjust_timer on Intel CPUs by __mcheck_cpu_init_vendor().
 */
static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1301
1302 static void __restart_timer(struct timer_list *t, unsigned long interval)
1303 {
1304         unsigned long when = jiffies + interval;
1305         unsigned long flags;
1306
1307         local_irq_save(flags);
1308
1309         if (timer_pending(t)) {
1310                 if (time_before(when, t->expires))
1311                         mod_timer_pinned(t, when);
1312         } else {
1313                 t->expires = round_jiffies(when);
1314                 add_timer_on(t, smp_processor_id());
1315         }
1316
1317         local_irq_restore(flags);
1318 }
1319
1320 static void mce_timer_fn(unsigned long data)
1321 {
1322         struct timer_list *t = this_cpu_ptr(&mce_timer);
1323         int cpu = smp_processor_id();
1324         unsigned long iv;
1325
1326         WARN_ON(cpu != data);
1327
1328         iv = __this_cpu_read(mce_next_interval);
1329
1330         if (mce_available(this_cpu_ptr(&cpu_info))) {
1331                 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1332
1333                 if (mce_intel_cmci_poll()) {
1334                         iv = mce_adjust_timer(iv);
1335                         goto done;
1336                 }
1337         }
1338
1339         /*
1340          * Alert userspace if needed. If we logged an MCE, reduce the polling
1341          * interval, otherwise increase the polling interval.
1342          */
1343         if (mce_notify_irq())
1344                 iv = max(iv / 2, (unsigned long) HZ/100);
1345         else
1346                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1347
1348 done:
1349         __this_cpu_write(mce_next_interval, iv);
1350         __restart_timer(t, iv);
1351 }
1352
1353 /*
1354  * Ensure that the timer is firing in @interval from now.
1355  */
1356 void mce_timer_kick(unsigned long interval)
1357 {
1358         struct timer_list *t = this_cpu_ptr(&mce_timer);
1359         unsigned long iv = __this_cpu_read(mce_next_interval);
1360
1361         __restart_timer(t, interval);
1362
1363         if (interval < iv)
1364                 __this_cpu_write(mce_next_interval, interval);
1365 }
1366
/*
 * Delete the polling timer of every online CPU with del_timer_sync().
 *
 * Must not be called in IRQ context where del_timer_sync() can deadlock.
 */
static void mce_timer_delete_all(void)
{
        int cpu;

        for_each_online_cpu(cpu)
                del_timer_sync(&per_cpu(mce_timer, cpu));
}
1375
/*
 * Work handler that starts the user mode helper named by mce_helper with
 * mce_helper_argv and no environment, without waiting for it (UMH_NO_WAIT).
 */
static void mce_do_trigger(struct work_struct *work)
{
        call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

/* Work item scheduled by mce_notify_irq() when a helper is configured. */
static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1382
1383 /*
1384  * Notify the user(s) about new machine check events.
1385  * Can be called from interrupt context, but not from machine check/NMI
1386  * context.
1387  */
1388 int mce_notify_irq(void)
1389 {
1390         /* Not more than two messages every minute */
1391         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1392
1393         if (test_and_clear_bit(0, &mce_need_notify)) {
1394                 /* wake processes polling /dev/mcelog */
1395                 wake_up_interruptible(&mce_chrdev_wait);
1396
1397                 if (mce_helper[0])
1398                         schedule_work(&mce_trigger_work);
1399
1400                 if (__ratelimit(&ratelimit))
1401                         pr_info(HW_ERR "Machine check events logged\n");
1402
1403                 return 1;
1404         }
1405         return 0;
1406 }
1407 EXPORT_SYMBOL_GPL(mce_notify_irq);
1408
1409 static int __mcheck_cpu_mce_banks_init(void)
1410 {
1411         int i;
1412         u8 num_banks = mca_cfg.banks;
1413
1414         mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1415         if (!mce_banks)
1416                 return -ENOMEM;
1417
1418         for (i = 0; i < num_banks; i++) {
1419                 struct mce_bank *b = &mce_banks[i];
1420
1421                 b->ctl = -1ULL;
1422                 b->init = 1;
1423         }
1424         return 0;
1425 }
1426
1427 /*
1428  * Initialize Machine Checks for a CPU.
1429  */
1430 static int __mcheck_cpu_cap_init(void)
1431 {
1432         unsigned b;
1433         u64 cap;
1434
1435         rdmsrl(MSR_IA32_MCG_CAP, cap);
1436
1437         b = cap & MCG_BANKCNT_MASK;
1438         if (!mca_cfg.banks)
1439                 pr_info("CPU supports %d MCE banks\n", b);
1440
1441         if (b > MAX_NR_BANKS) {
1442                 pr_warn("Using only %u machine check banks out of %u\n",
1443                         MAX_NR_BANKS, b);
1444                 b = MAX_NR_BANKS;
1445         }
1446
1447         /* Don't support asymmetric configurations today */
1448         WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1449         mca_cfg.banks = b;
1450
1451         if (!mce_banks) {
1452                 int err = __mcheck_cpu_mce_banks_init();
1453
1454                 if (err)
1455                         return err;
1456         }
1457
1458         /* Use accurate RIP reporting if available. */
1459         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1460                 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1461
1462         if (cap & MCG_SER_P)
1463                 mca_cfg.ser = true;
1464
1465         return 0;
1466 }
1467
1468 static void __mcheck_cpu_init_generic(void)
1469 {
1470         enum mcp_flags m_fl = 0;
1471         mce_banks_t all_banks;
1472         u64 cap;
1473         int i;
1474
1475         if (!mca_cfg.bootlog)
1476                 m_fl = MCP_DONTLOG;
1477
1478         /*
1479          * Log the machine checks left over from the previous reset.
1480          */
1481         bitmap_fill(all_banks, MAX_NR_BANKS);
1482         machine_check_poll(MCP_UC | m_fl, &all_banks);
1483
1484         cr4_set_bits(X86_CR4_MCE);
1485
1486         rdmsrl(MSR_IA32_MCG_CAP, cap);
1487         if (cap & MCG_CTL_P)
1488                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1489
1490         for (i = 0; i < mca_cfg.banks; i++) {
1491                 struct mce_bank *b = &mce_banks[i];
1492
1493                 if (!b->init)
1494                         continue;
1495                 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1496                 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1497         }
1498 }
1499
1500 /*
1501  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1502  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1503  * Vol 3B Table 15-20). But this confuses both the code that determines
1504  * whether the machine check occurred in kernel or user mode, and also
1505  * the severity assessment code. Pretend that EIPV was set, and take the
1506  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1507  */
1508 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1509 {
1510         if (bank != 0)
1511                 return;
1512         if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1513                 return;
1514         if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1515                           MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1516                           MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1517                           MCACOD)) !=
1518                          (MCI_STATUS_UC|MCI_STATUS_EN|
1519                           MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1520                           MCI_STATUS_AR|MCACOD_INSTR))
1521                 return;
1522
1523         m->mcgstatus |= MCG_STATUS_EIPV;
1524         m->ip = regs->ip;
1525         m->cs = regs->cs;
1526 }
1527
/*
 * Add per CPU specific workarounds here.
 *
 * Adjusts mca_cfg, mce_banks[] and mce_flags according to the vendor,
 * family and model found in @c.
 *
 * Returns -EOPNOTSUPP when the CPU vendor is unknown, 0 otherwise.
 */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
        struct mca_config *cfg = &mca_cfg;

        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
                pr_info("unknown CPU type - not enabling MCE support\n");
                return -EOPNOTSUPP;
        }

        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
                if (c->x86 == 15 && cfg->banks > 4) {
                        /*
                         * disable GART TBL walk error reporting, which
                         * trips off incorrectly with the IOMMU & 3ware
                         * & Cerberus:
                         */
                        clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
                }
                if (c->x86 <= 17 && cfg->bootlog < 0) {
                        /*
                         * Lots of broken BIOS around that don't clear them
                         * by default and leave crap in there. Don't log:
                         */
                        cfg->bootlog = 0;
                }
                /*
                 * Various K7s with broken bank 0 around. Always disable
                 * by default.
                 */
                if (c->x86 == 6 && cfg->banks > 0)
                        mce_banks[0].ctl = 0;

                /*
                 * overflow_recov is supported for F15h Models 00h-0fh
                 * even though we don't have a CPUID bit for it.
                 */
                if (c->x86 == 0x15 && c->x86_model <= 0xf)
                        mce_flags.overflow_recov = 1;

                /*
                 * Turn off MC4_MISC thresholding banks on those models since
                 * they're not supported there.
                 */
                if (c->x86 == 0x15 &&
                    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
                        int i;
                        u64 hwcr;
                        bool need_toggle;
                        u32 msrs[] = {
                                0x00000413, /* MC4_MISC0 */
                                0xc0000408, /* MC4_MISC1 */
                        };

                        rdmsrl(MSR_K7_HWCR, hwcr);

                        /* McStatusWrEn has to be set */
                        need_toggle = !(hwcr & BIT(18));

                        if (need_toggle)
                                wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

                        /* Clear CntP bit safely */
                        for (i = 0; i < ARRAY_SIZE(msrs); i++)
                                msr_clear_bit(msrs[i], 62);

                        /* restore old settings */
                        if (need_toggle)
                                wrmsrl(MSR_K7_HWCR, hwcr);
                }
        }

        if (c->x86_vendor == X86_VENDOR_INTEL) {
                /*
                 * SDM documents that on family 6 bank 0 should not be written
                 * because it aliases to another special BIOS controlled
                 * register.
                 * But it's not aliased anymore on model 0x1a+
                 * Don't ignore bank 0 completely because there could be a
                 * valid event later, merely don't write CTL0.
                 */

                if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
                        mce_banks[0].init = 0;

                /*
                 * All newer Intel systems support MCE broadcasting. Enable
                 * synchronization with a one second timeout.
                 */
                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
                        cfg->monarch_timeout < 0)
                        cfg->monarch_timeout = USEC_PER_SEC;

                /*
                 * There are also broken BIOSes on some Pentium M and
                 * earlier systems:
                 */
                if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
                        cfg->bootlog = 0;

                /* Family 6 model 45: install the Sandy Bridge IFU quirk. */
                if (c->x86 == 6 && c->x86_model == 45)
                        quirk_no_way_out = quirk_sandybridge_ifu;
        }
        /* A still-negative timeout means nobody chose one: disable it. */
        if (cfg->monarch_timeout < 0)
                cfg->monarch_timeout = 0;
        if (cfg->bootlog != 0)
                cfg->panic_timeout = 30;

        return 0;
}
1639
1640 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1641 {
1642         if (c->x86 != 5)
1643                 return 0;
1644
1645         switch (c->x86_vendor) {
1646         case X86_VENDOR_INTEL:
1647                 intel_p5_mcheck_init(c);
1648                 return 1;
1649                 break;
1650         case X86_VENDOR_CENTAUR:
1651                 winchip_mcheck_init(c);
1652                 return 1;
1653                 break;
1654         }
1655
1656         return 0;
1657 }
1658
1659 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1660 {
1661         switch (c->x86_vendor) {
1662         case X86_VENDOR_INTEL:
1663                 mce_intel_feature_init(c);
1664                 mce_adjust_timer = cmci_intel_adjust_timer;
1665                 break;
1666
1667         case X86_VENDOR_AMD: {
1668                 u32 ebx = cpuid_ebx(0x80000007);
1669
1670                 mce_amd_feature_init(c);
1671                 mce_flags.overflow_recov = !!(ebx & BIT(0));
1672                 mce_flags.succor         = !!(ebx & BIT(1));
1673                 break;
1674                 }
1675
1676         default:
1677                 break;
1678         }
1679 }
1680
1681 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1682 {
1683         unsigned long iv = check_interval * HZ;
1684
1685         if (mca_cfg.ignore_ce || !iv)
1686                 return;
1687
1688         per_cpu(mce_next_interval, cpu) = iv;
1689
1690         t->expires = round_jiffies(jiffies + iv);
1691         add_timer_on(t, cpu);
1692 }
1693
/*
 * Set up the current CPU's polling timer with mce_timer_fn() as handler
 * and the CPU number as its data argument, then start it.
 */
static void __mcheck_cpu_init_timer(void)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);
        unsigned int cpu = smp_processor_id();

        setup_timer(t, mce_timer_fn, cpu);
        mce_start_timer(cpu, t);
}
1702
/*
 * Handle unconfigured int18 (should never happen).
 * Only logs an error naming the current CPU; @regs and @error_code are
 * not used.
 */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
        pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
               smp_processor_id());
}

/*
 * Call the installed machine check handler for this CPU setup.
 * Defaults to unexpected_machine_check(); mcheck_cpu_init() switches it
 * to do_machine_check() once setup has succeeded.
 */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
                                                unexpected_machine_check;
1713
1714 /*
1715  * Called for each booted CPU to set up machine checks.
1716  * Must be called with preempt off:
1717  */
1718 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1719 {
1720         if (mca_cfg.disabled)
1721                 return;
1722
1723         if (__mcheck_cpu_ancient_init(c))
1724                 return;
1725
1726         if (!mce_available(c))
1727                 return;
1728
1729         if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1730                 mca_cfg.disabled = true;
1731                 return;
1732         }
1733
1734         machine_check_vector = do_machine_check;
1735
1736         __mcheck_cpu_init_generic();
1737         __mcheck_cpu_init_vendor(c);
1738         __mcheck_cpu_init_timer();
1739         INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
1740         init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
1741 }
1742
/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */

/* Taken by the open and release handlers around the two counters below. */
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;       /* #times opened */
static int mce_chrdev_open_exclu;       /* already open exclusive? */
1750
1751 static int mce_chrdev_open(struct inode *inode, struct file *file)
1752 {
1753         spin_lock(&mce_chrdev_state_lock);
1754
1755         if (mce_chrdev_open_exclu ||
1756             (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1757                 spin_unlock(&mce_chrdev_state_lock);
1758
1759                 return -EBUSY;
1760         }
1761
1762         if (file->f_flags & O_EXCL)
1763                 mce_chrdev_open_exclu = 1;
1764         mce_chrdev_open_count++;
1765
1766         spin_unlock(&mce_chrdev_state_lock);
1767
1768         return nonseekable_open(inode, file);
1769 }
1770
1771 static int mce_chrdev_release(struct inode *inode, struct file *file)
1772 {
1773         spin_lock(&mce_chrdev_state_lock);
1774
1775         mce_chrdev_open_count--;
1776         mce_chrdev_open_exclu = 0;
1777
1778         spin_unlock(&mce_chrdev_state_lock);
1779
1780         return 0;
1781 }
1782
/*
 * Cross-CPU callback: store the calling CPU's TSC into its slot of the
 * array passed in @data (indexed by CPU number).
 */
static void collect_tscs(void *data)
{
        unsigned long *tsc_by_cpu = data;
        int cpu = smp_processor_id();

        rdtscll(tsc_by_cpu[cpu]);
}
1789
/*
 * Set to 1 by __mce_read_apei() once reading or clearing an APEI record
 * fails or no record is left; mce_chrdev_read() and mce_chrdev_poll() then
 * stop consulting APEI.
 */
static int mce_apei_read_done;
1791
1792 /* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1793 static int __mce_read_apei(char __user **ubuf, size_t usize)
1794 {
1795         int rc;
1796         u64 record_id;
1797         struct mce m;
1798
1799         if (usize < sizeof(struct mce))
1800                 return -EINVAL;
1801
1802         rc = apei_read_mce(&m, &record_id);
1803         /* Error or no more MCE record */
1804         if (rc <= 0) {
1805                 mce_apei_read_done = 1;
1806                 /*
1807                  * When ERST is disabled, mce_chrdev_read() should return
1808                  * "no record" instead of "no device."
1809                  */
1810                 if (rc == -ENODEV)
1811                         return 0;
1812                 return rc;
1813         }
1814         rc = -EFAULT;
1815         if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1816                 return rc;
1817         /*
1818          * In fact, we should have cleared the record after that has
1819          * been flushed to the disk or sent to network in
1820          * /sbin/mcelog, but we have no interface to support that now,
1821          * so just clear it to avoid duplication.
1822          */
1823         rc = apei_clear_mce(record_id);
1824         if (rc) {
1825                 mce_apei_read_done = 1;
1826                 return rc;
1827         }
1828         *ubuf += sizeof(struct mce);
1829
1830         return 0;
1831 }
1832
1833 static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1834                                 size_t usize, loff_t *off)
1835 {
1836         char __user *buf = ubuf;
1837         unsigned long *cpu_tsc;
1838         unsigned prev, next;
1839         int i, err;
1840
1841         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1842         if (!cpu_tsc)
1843                 return -ENOMEM;
1844
1845         mutex_lock(&mce_chrdev_read_mutex);
1846
1847         if (!mce_apei_read_done) {
1848                 err = __mce_read_apei(&buf, usize);
1849                 if (err || buf != ubuf)
1850                         goto out;
1851         }
1852
1853         next = rcu_dereference_check_mce(mcelog.next);
1854
1855         /* Only supports full reads right now */
1856         err = -EINVAL;
1857         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1858                 goto out;
1859
1860         err = 0;
1861         prev = 0;
1862         do {
1863                 for (i = prev; i < next; i++) {
1864                         unsigned long start = jiffies;
1865                         struct mce *m = &mcelog.entry[i];
1866
1867                         while (!m->finished) {
1868                                 if (time_after_eq(jiffies, start + 2)) {
1869                                         memset(m, 0, sizeof(*m));
1870                                         goto timeout;
1871                                 }
1872                                 cpu_relax();
1873                         }
1874                         smp_rmb();
1875                         err |= copy_to_user(buf, m, sizeof(*m));
1876                         buf += sizeof(*m);
1877 timeout:
1878                         ;
1879                 }
1880
1881                 memset(mcelog.entry + prev, 0,
1882                        (next - prev) * sizeof(struct mce));
1883                 prev = next;
1884                 next = cmpxchg(&mcelog.next, prev, 0);
1885         } while (next != prev);
1886
1887         synchronize_sched();
1888
1889         /*
1890          * Collect entries that were still getting written before the
1891          * synchronize.
1892          */
1893         on_each_cpu(collect_tscs, cpu_tsc, 1);
1894
1895         for (i = next; i < MCE_LOG_LEN; i++) {
1896                 struct mce *m = &mcelog.entry[i];
1897
1898                 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1899                         err |= copy_to_user(buf, m, sizeof(*m));
1900                         smp_rmb();
1901                         buf += sizeof(*m);
1902                         memset(m, 0, sizeof(*m));
1903                 }
1904         }
1905
1906         if (err)
1907                 err = -EFAULT;
1908
1909 out:
1910         mutex_unlock(&mce_chrdev_read_mutex);
1911         kfree(cpu_tsc);
1912
1913         return err ? err : buf - ubuf;
1914 }
1915
1916 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1917 {
1918         poll_wait(file, &mce_chrdev_wait, wait);
1919         if (READ_ONCE(mcelog.next))
1920                 return POLLIN | POLLRDNORM;
1921         if (!mce_apei_read_done && apei_check_mce())
1922                 return POLLIN | POLLRDNORM;
1923         return 0;
1924 }
1925
1926 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1927                                 unsigned long arg)
1928 {
1929         int __user *p = (int __user *)arg;
1930
1931         if (!capable(CAP_SYS_ADMIN))
1932                 return -EPERM;
1933
1934         switch (cmd) {
1935         case MCE_GET_RECORD_LEN:
1936                 return put_user(sizeof(struct mce), p);
1937         case MCE_GET_LOG_LEN:
1938                 return put_user(MCE_LOG_LEN, p);
1939         case MCE_GETCLEAR_FLAGS: {
1940                 unsigned flags;
1941
1942                 do {
1943                         flags = mcelog.flags;
1944                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1945
1946                 return put_user(flags, p);
1947         }
1948         default:
1949                 return -ENOTTY;
1950         }
1951 }
1952
/*
 * Optional write() backend for /dev/mcelog. NULL until a callback is
 * registered; mce_chrdev_write() returns -EINVAL while it is unset.
 */
static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
                            size_t usize, loff_t *off);

/*
 * Install @fn as the write() backend for /dev/mcelog. A later call simply
 * replaces the previous callback.
 */
void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
                             const char __user *ubuf,
                             size_t usize, loff_t *off))
{
        mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);
1963
1964 static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1965                                 size_t usize, loff_t *off)
1966 {
1967         if (mce_write)
1968                 return mce_write(filp, ubuf, usize, off);
1969         else
1970                 return -EINVAL;
1971 }
1972
/* File operations backing the /dev/mcelog misc device. */
static const struct file_operations mce_chrdev_ops = {
        .open                   = mce_chrdev_open,
        .release                = mce_chrdev_release,
        .read                   = mce_chrdev_read,
        .write                  = mce_chrdev_write,
        .poll                   = mce_chrdev_poll,
        .unlocked_ioctl         = mce_chrdev_ioctl,
        .llseek                 = no_llseek,
};
1982
1983 static struct miscdevice mce_chrdev_device = {
1984         MISC_MCELOG_MINOR,
1985         "mcelog",
1986         &mce_chrdev_ops,
1987 };
1988
1989 static void __mce_disable_bank(void *arg)
1990 {
1991         int bank = *((int *)arg);
1992         __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1993         cmci_disable_bank(bank);
1994 }
1995
1996 void mce_disable_bank(int bank)
1997 {
1998         if (bank >= mca_cfg.banks) {
1999                 pr_warn(FW_BUG
2000                         "Ignoring request to disable invalid MCA bank %d.\n",
2001                         bank);
2002                 return;
2003         }
2004         set_bit(bank, mce_banks_ce_disabled);
2005         on_each_cpu(__mce_disable_bank, &bank, 1);
2006 }
2007
2008 /*
2009  * mce=off Disables machine check
2010  * mce=no_cmci Disables CMCI
2011  * mce=no_lmce Disables LMCE
2012  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2013  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2014  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2015  *      monarchtimeout is how long to wait for other CPUs on machine
2016  *      check, or 0 to not wait
2017  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
2018  * mce=nobootlog Don't log MCEs from before booting.
2019  * mce=bios_cmci_threshold Don't program the CMCI threshold
2020  */
2021 static int __init mcheck_enable(char *str)
2022 {
2023         struct mca_config *cfg = &mca_cfg;
2024
2025         if (*str == 0) {
2026                 enable_p5_mce();
2027                 return 1;
2028         }
2029         if (*str == '=')
2030                 str++;
2031         if (!strcmp(str, "off"))
2032                 cfg->disabled = true;
2033         else if (!strcmp(str, "no_cmci"))
2034                 cfg->cmci_disabled = true;
2035         else if (!strcmp(str, "no_lmce"))
2036                 cfg->lmce_disabled = true;
2037         else if (!strcmp(str, "dont_log_ce"))
2038                 cfg->dont_log_ce = true;
2039         else if (!strcmp(str, "ignore_ce"))
2040                 cfg->ignore_ce = true;
2041         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2042                 cfg->bootlog = (str[0] == 'b');
2043         else if (!strcmp(str, "bios_cmci_threshold"))
2044                 cfg->bios_cmci_threshold = true;
2045         else if (isdigit(str[0])) {
2046                 if (get_option(&str, &cfg->tolerant) == 2)
2047                         get_option(&str, &(cfg->monarch_timeout));
2048         } else {
2049                 pr_info("mce argument %s ignored. Please use /sys\n", str);
2050                 return 0;
2051         }
2052         return 1;
2053 }
2054 __setup("mce", mcheck_enable);
2055
/*
 * Boot-time MCE initialisation: runs the Intel thermal init and the vendor
 * severity init hooks. Always returns 0.
 */
int __init mcheck_init(void)
{
        mcheck_intel_therm_init();
        mcheck_vendor_init_severity();

        return 0;
}
2063
2064 /*
2065  * mce_syscore: PM support
2066  */
2067
2068 /*
2069  * Disable machine checks on suspend and shutdown. We can't really handle
2070  * them later.
2071  */
2072 static int mce_disable_error_reporting(void)
2073 {
2074         int i;
2075
2076         for (i = 0; i < mca_cfg.banks; i++) {
2077                 struct mce_bank *b = &mce_banks[i];
2078
2079                 if (b->init)
2080                         wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2081         }
2082         return 0;
2083 }
2084
/* syscore suspend hook: turn off error reporting in all initialised banks. */
static int mce_syscore_suspend(void)
{
        return mce_disable_error_reporting();
}
2089
/* syscore shutdown hook: turn off error reporting in all initialised banks. */
static void mce_syscore_shutdown(void)
{
        mce_disable_error_reporting();
}
2094
2095 /*
2096  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2097  * Only one CPU is active at this time, the others get re-added later using
2098  * CPU hotplug:
2099  */
2100 static void mce_syscore_resume(void)
2101 {
2102         __mcheck_cpu_init_generic();
2103         __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2104 }
2105
/* PM hooks; registered from mcheck_init_device() via register_syscore_ops(). */
static struct syscore_ops mce_syscore_ops = {
        .suspend        = mce_syscore_suspend,
        .shutdown       = mce_syscore_shutdown,
        .resume         = mce_syscore_resume,
};
2111
2112 /*
2113  * mce_device: Sysfs support
2114  */
2115
2116 static void mce_cpu_restart(void *data)
2117 {
2118         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2119                 return;
2120         __mcheck_cpu_init_generic();
2121         __mcheck_cpu_init_timer();
2122 }
2123
/*
 * Reinit MCEs after user configuration changes: delete all polling timers,
 * then run mce_cpu_restart() on every CPU and wait for it to finish.
 */
static void mce_restart(void)
{
        mce_timer_delete_all();
        on_each_cpu(mce_cpu_restart, NULL, 1);
}
2130
2131 /* Toggle features for corrected errors */
2132 static void mce_disable_cmci(void *data)
2133 {
2134         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2135                 return;
2136         cmci_clear();
2137 }
2138
2139 static void mce_enable_ce(void *all)
2140 {
2141         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2142                 return;
2143         cmci_reenable();
2144         cmci_recheck();
2145         if (all)
2146                 __mcheck_cpu_init_timer();
2147 }
2148
/*
 * Bus for the per-CPU "machinecheck" devices; registered from
 * mcheck_init_device() via subsys_system_register().
 */
static struct bus_type mce_subsys = {
        .name           = "machinecheck",
        .dev_name       = "machinecheck",
};

/* Per-CPU device pointer: set by mce_device_create(), reset by mce_device_remove(). */
DEFINE_PER_CPU(struct device *, mce_device);

/*
 * Optional hook; when non-NULL, mce_cpu_callback() invokes it for
 * CPU_ONLINE and CPU_DEAD events.
 */
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2157
/* Map a bank's embedded device attribute back to its struct mce_bank. */
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
        return container_of(attr, struct mce_bank, attr);
}
2162
2163 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2164                          char *buf)
2165 {
2166         return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2167 }
2168
2169 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2170                         const char *buf, size_t size)
2171 {
2172         u64 new;
2173
2174         if (kstrtou64(buf, 0, &new) < 0)
2175                 return -EINVAL;
2176
2177         attr_to_bank(attr)->ctl = new;
2178         mce_restart();
2179
2180         return size;
2181 }
2182
2183 static ssize_t
2184 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2185 {
2186         strcpy(buf, mce_helper);
2187         strcat(buf, "\n");
2188         return strlen(mce_helper) + 1;
2189 }
2190
2191 static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2192                                 const char *buf, size_t siz)
2193 {
2194         char *p;
2195
2196         strncpy(mce_helper, buf, sizeof(mce_helper));
2197         mce_helper[sizeof(mce_helper)-1] = 0;
2198         p = strchr(mce_helper, '\n');
2199
2200         if (p)
2201                 *p = 0;
2202
2203         return strlen(mce_helper) + !!p;
2204 }
2205
2206 static ssize_t set_ignore_ce(struct device *s,
2207                              struct device_attribute *attr,
2208                              const char *buf, size_t size)
2209 {
2210         u64 new;
2211
2212         if (kstrtou64(buf, 0, &new) < 0)
2213                 return -EINVAL;
2214
2215         if (mca_cfg.ignore_ce ^ !!new) {
2216                 if (new) {
2217                         /* disable ce features */
2218                         mce_timer_delete_all();
2219                         on_each_cpu(mce_disable_cmci, NULL, 1);
2220                         mca_cfg.ignore_ce = true;
2221                 } else {
2222                         /* enable ce features */
2223                         mca_cfg.ignore_ce = false;
2224                         on_each_cpu(mce_enable_ce, (void *)1, 1);
2225                 }
2226         }
2227         return size;
2228 }
2229
2230 static ssize_t set_cmci_disabled(struct device *s,
2231                                  struct device_attribute *attr,
2232                                  const char *buf, size_t size)
2233 {
2234         u64 new;
2235
2236         if (kstrtou64(buf, 0, &new) < 0)
2237                 return -EINVAL;
2238
2239         if (mca_cfg.cmci_disabled ^ !!new) {
2240                 if (new) {
2241                         /* disable cmci */
2242                         on_each_cpu(mce_disable_cmci, NULL, 1);
2243                         mca_cfg.cmci_disabled = true;
2244                 } else {
2245                         /* enable cmci */
2246                         mca_cfg.cmci_disabled = false;
2247                         on_each_cpu(mce_enable_ce, NULL, 1);
2248                 }
2249         }
2250         return size;
2251 }
2252
2253 static ssize_t store_int_with_restart(struct device *s,
2254                                       struct device_attribute *attr,
2255                                       const char *buf, size_t size)
2256 {
2257         ssize_t ret = device_store_int(s, attr, buf, size);
2258         mce_restart();
2259         return ret;
2260 }
2261
/* Per-device sysfs attributes, all mode 0644. */
static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

/* check_interval: writes go through store_int_with_restart(). */
static struct dev_ext_attribute dev_attr_check_interval = {
        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
        &check_interval
};

/* ignore_ce: writes go through set_ignore_ce(). */
static struct dev_ext_attribute dev_attr_ignore_ce = {
        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
        &mca_cfg.ignore_ce
};

/* cmci_disabled: writes go through set_cmci_disabled(). */
static struct dev_ext_attribute dev_attr_cmci_disabled = {
        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
        &mca_cfg.cmci_disabled
};
2281
/*
 * NULL-terminated list of the attributes created on every per-CPU device
 * by mce_device_create() and removed again by mce_device_remove().
 */
static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
        &dev_attr_trigger,
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
        &dev_attr_cmci_disabled.attr,
        NULL
};

/* CPUs whose device was fully created; allocated in mcheck_init_device(). */
static cpumask_var_t mce_device_initialized;
2294
/* Device release callback: free the struct device allocated in mce_device_create(). */
static void mce_device_release(struct device *dev)
{
        kfree(dev);
}
2299
2300 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
2301 static int mce_device_create(unsigned int cpu)
2302 {
2303         struct device *dev;
2304         int err;
2305         int i, j;
2306
2307         if (!mce_available(&boot_cpu_data))
2308                 return -EIO;
2309
2310         dev = kzalloc(sizeof *dev, GFP_KERNEL);
2311         if (!dev)
2312                 return -ENOMEM;
2313         dev->id  = cpu;
2314         dev->bus = &mce_subsys;
2315         dev->release = &mce_device_release;
2316
2317         err = device_register(dev);
2318         if (err) {
2319                 put_device(dev);
2320                 return err;
2321         }
2322
2323         for (i = 0; mce_device_attrs[i]; i++) {
2324                 err = device_create_file(dev, mce_device_attrs[i]);
2325                 if (err)
2326                         goto error;
2327         }
2328         for (j = 0; j < mca_cfg.banks; j++) {
2329                 err = device_create_file(dev, &mce_banks[j].attr);
2330                 if (err)
2331                         goto error2;
2332         }
2333         cpumask_set_cpu(cpu, mce_device_initialized);
2334         per_cpu(mce_device, cpu) = dev;
2335
2336         return 0;
2337 error2:
2338         while (--j >= 0)
2339                 device_remove_file(dev, &mce_banks[j].attr);
2340 error:
2341         while (--i >= 0)
2342                 device_remove_file(dev, mce_device_attrs[i]);
2343
2344         device_unregister(dev);
2345
2346         return err;
2347 }
2348
2349 static void mce_device_remove(unsigned int cpu)
2350 {
2351         struct device *dev = per_cpu(mce_device, cpu);
2352         int i;
2353
2354         if (!cpumask_test_cpu(cpu, mce_device_initialized))
2355                 return;
2356
2357         for (i = 0; mce_device_attrs[i]; i++)
2358                 device_remove_file(dev, mce_device_attrs[i]);
2359
2360         for (i = 0; i < mca_cfg.banks; i++)
2361                 device_remove_file(dev, &mce_banks[i].attr);
2362
2363         device_unregister(dev);
2364         cpumask_clear_cpu(cpu, mce_device_initialized);
2365         per_cpu(mce_device, cpu) = NULL;
2366 }
2367
2368 /* Make sure there are no machine checks on offlined CPUs. */
2369 static void mce_disable_cpu(void *h)
2370 {
2371         unsigned long action = *(unsigned long *)h;
2372         int i;
2373
2374         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2375                 return;
2376
2377         if (!(action & CPU_TASKS_FROZEN))
2378                 cmci_clear();
2379         for (i = 0; i < mca_cfg.banks; i++) {
2380                 struct mce_bank *b = &mce_banks[i];
2381
2382                 if (b->init)
2383                         wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2384         }
2385 }
2386
2387 static void mce_reenable_cpu(void *h)
2388 {
2389         unsigned long action = *(unsigned long *)h;
2390         int i;
2391
2392         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2393                 return;
2394
2395         if (!(action & CPU_TASKS_FROZEN))
2396                 cmci_reenable();
2397         for (i = 0; i < mca_cfg.banks; i++) {
2398                 struct mce_bank *b = &mce_banks[i];
2399
2400                 if (b->init)
2401                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2402         }
2403 }
2404
2405 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2406 static int
2407 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2408 {
2409         unsigned int cpu = (unsigned long)hcpu;
2410         struct timer_list *t = &per_cpu(mce_timer, cpu);
2411
2412         switch (action & ~CPU_TASKS_FROZEN) {
2413         case CPU_ONLINE:
2414                 mce_device_create(cpu);
2415                 if (threshold_cpu_callback)
2416                         threshold_cpu_callback(action, cpu);
2417                 break;
2418         case CPU_DEAD:
2419                 if (threshold_cpu_callback)
2420                         threshold_cpu_callback(action, cpu);
2421                 mce_device_remove(cpu);
2422                 mce_intel_hcpu_update(cpu);
2423
2424                 /* intentionally ignoring frozen here */
2425                 if (!(action & CPU_TASKS_FROZEN))
2426                         cmci_rediscover();
2427                 break;
2428         case CPU_DOWN_PREPARE:
2429                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2430                 del_timer_sync(t);
2431                 break;
2432         case CPU_DOWN_FAILED:
2433                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2434                 mce_start_timer(cpu, t);
2435                 break;
2436         }
2437
2438         return NOTIFY_OK;
2439 }
2440
/* CPU hotplug notifier; registered from mcheck_init_device(). */
static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};
2444
2445 static __init void mce_init_banks(void)
2446 {
2447         int i;
2448
2449         for (i = 0; i < mca_cfg.banks; i++) {
2450                 struct mce_bank *b = &mce_banks[i];
2451                 struct device_attribute *a = &b->attr;
2452
2453                 sysfs_attr_init(&a->attr);
2454                 a->attr.name    = b->attrname;
2455                 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2456
2457                 a->attr.mode    = 0644;
2458                 a->show         = show_bank;
2459                 a->store        = set_bank;
2460         }
2461 }
2462
2463 static __init int mcheck_init_device(void)
2464 {
2465         int err;
2466         int i = 0;
2467
2468         if (!mce_available(&boot_cpu_data)) {
2469                 err = -EIO;
2470                 goto err_out;
2471         }
2472
2473         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2474                 err = -ENOMEM;
2475                 goto err_out;
2476         }
2477
2478         mce_init_banks();
2479
2480         err = subsys_system_register(&mce_subsys, NULL);
2481         if (err)
2482                 goto err_out_mem;
2483
2484         cpu_notifier_register_begin();
2485         for_each_online_cpu(i) {
2486                 err = mce_device_create(i);
2487                 if (err) {
2488                         /*
2489                          * Register notifier anyway (and do not unreg it) so
2490                          * that we don't leave undeleted timers, see notifier
2491                          * callback above.
2492                          */
2493                         __register_hotcpu_notifier(&mce_cpu_notifier);
2494                         cpu_notifier_register_done();
2495                         goto err_device_create;
2496                 }
2497         }
2498
2499         __register_hotcpu_notifier(&mce_cpu_notifier);
2500         cpu_notifier_register_done();
2501
2502         register_syscore_ops(&mce_syscore_ops);
2503
2504         /* register character device /dev/mcelog */
2505         err = misc_register(&mce_chrdev_device);
2506         if (err)
2507                 goto err_register;
2508
2509         return 0;
2510
2511 err_register:
2512         unregister_syscore_ops(&mce_syscore_ops);
2513
2514 err_device_create:
2515         /*
2516          * We didn't keep track of which devices were created above, but
2517          * even if we had, the set of online cpus might have changed.
2518          * Play safe and remove for every possible cpu, since
2519          * mce_device_remove() will do the right thing.
2520          */
2521         for_each_possible_cpu(i)
2522                 mce_device_remove(i);
2523
2524 err_out_mem:
2525         free_cpumask_var(mce_device_initialized);
2526
2527 err_out:
2528         pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2529
2530         return err;
2531 }
2532 device_initcall_sync(mcheck_init_device);
2533
/*
 * Old style boot options parsing. Only for compatibility.
 *
 * "nomce" disables machine check handling; the argument string is ignored.
 */
static int __init mcheck_disable(char *str)
{
        mca_cfg.disabled = true;
        return 1;
}
__setup("nomce", mcheck_disable);
2543
2544 #ifdef CONFIG_DEBUG_FS
2545 struct dentry *mce_get_debugfs_dir(void)
2546 {
2547         static struct dentry *dmce;
2548
2549         if (!dmce)
2550                 dmce = debugfs_create_dir("mce", NULL);
2551
2552         return dmce;
2553 }
2554
/*
 * Zero cpu_missing and the mce_fake_panicked, mce_executing, mce_callin
 * and global_nwo counters. Called from fake_panic_set().
 */
static void mce_reset(void)
{
        cpu_missing = 0;
        atomic_set(&mce_fake_panicked, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
}
2563
/* debugfs getter: report the current fake_panic value. Always returns 0. */
static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}
2569
/*
 * debugfs setter: reset the state cleared by mce_reset(), then store @val
 * in fake_panic. Always returns 0.
 */
static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}
2576
/* File operations for the "fake_panic" debugfs file; value shown as "%llu\n". */
DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");
2579
2580 static int __init mcheck_debugfs_init(void)
2581 {
2582         struct dentry *dmce, *ffake_panic;
2583
2584         dmce = mce_get_debugfs_dir();
2585         if (!dmce)
2586                 return -ENOMEM;
2587         ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2588                                           &fake_panic_fops);
2589         if (!ffake_panic)
2590                 return -ENOMEM;
2591
2592         return 0;
2593 }
2594 late_initcall(mcheck_debugfs_init);
2595 #endif