[linux-2.6/verdex.git] arch/x86_64/kernel/mce.c
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
/* Per-bank MCi_CTL values; all error types enabled by default. */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog;
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
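/*
 * Protocol (as implemented below): a writer reserves a slot by advancing
 * mcelog.next with cmpxchg, fills the entry in, then publishes it by
 * setting ->finished behind a write barrier. Readers treat ->finished
 * as the "entry is valid" flag, so no locks are needed on either side.
 */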
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	smp_wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	smp_wmb();
	mcelog.entry[entry].finished = 1;
	smp_wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}
static int mce_available(struct cpuinfo_x86 *c)
{
	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
}
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}
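/*
 * Note on the error_code convention, inferred from the call sites below
 * and in mce_init()/mcheck_check_cpu(): error_code >= 0 (a real #MC or
 * the poller) records the TSC, -1 logs boot-time leftovers, and -2
 * clears the banks without logging them.
 */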
/*
 * The actual machine check handler
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? Assume that the bank
		   with uncorrectable errors did it, and that there is only
		   a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		tainted |= TAINT_MACHINE_CHECK;
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0).

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a
		   slight risk of deadlocking. If you don't want that,
		   don't set tolerant >= 2. */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}

static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers. */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	/* Each bank has four consecutive MSRs (CTL, STATUS, ADDR, MISC),
	   hence the stride of 4. */
	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL + 4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS + 4*i, 0);
	}
}
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
	}
}
static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	default:
		break;
	}
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}
/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
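/*
 * Reading runs in two passes: first copy out every finished entry up to
 * mcelog.next, then reset the buffer, synchronize, snapshot each CPU's
 * TSC via collect_tscs(), and pick up stragglers whose writes raced with
 * the reset (identifiable because their tsc predates the snapshot).
 */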
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;

		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				/* The writer never finished; discard the
				   entry rather than spinning forever. */
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
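/*
 * Illustrative userspace sketch (not part of this file; names follow the
 * definitions above). mce_read() rejects partial reads, so a consumer
 * must ask for the whole buffer at once:
 *
 *	struct mce records[MCE_LOG_LEN];
 *	unsigned flags;
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	ssize_t n = read(fd, records, sizeof(records)); // n / sizeof(struct mce) entries
 *	ioctl(fd, MCE_GETCLEAR_FLAGS, &flags);          // test for MCE_OVERFLOW
 */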
/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default to work
   around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog"))
		mce_bootlog = 1;
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) {	\
		return sprintf(buf, "%lx\n", (unsigned long)var);		\
	}									\
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end;							\
		unsigned long new = simple_strtoul(buf, &end, 0);		\
		if (end == buf) return -EINVAL;					\
		var = new;							\
		start;								\
		return end-buf;							\
	}									\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

ACCESSOR(bank0ctl, bank[0], mce_restart())
ACCESSOR(bank1ctl, bank[1], mce_restart())
ACCESSOR(bank2ctl, bank[2], mce_restart())
ACCESSOR(bank3ctl, bank[3], mce_restart())
ACCESSOR(bank4ctl, bank[4], mce_restart())
ACCESSOR(tolerant, tolerant, )
ACCESSOR(check_interval, check_interval, mce_restart())
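/*
 * With the sysdev class named "machinecheck", these attributes show up
 * per CPU under /sys/devices/system/machinecheck/machinecheck<N>/, e.g.
 * (assuming that layout):
 *
 *	echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank4ctl
 *
 * Writes go through set_<name>() and, where given, trigger mce_restart().
 */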
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;

	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));

	if (!err) {
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank0ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank1ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank2ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank3ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_bank4ctl);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_check_interval);
	}
	return err;
}
#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank0ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank1ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank2ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank3ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_bank4ctl);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce, cpu));
}
#endif
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_cpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);