/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/ratelimit.h>

#include <asm/processor.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR	227
static int mce_dont_init;
/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static unsigned long notify_user;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
        memset(m, 0, sizeof(struct mce));
        m->cpu = smp_processor_id();
        rdtscll(m->tsc);
}
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};
void mce_log(struct mce *mce)
{
        unsigned next, entry;

        atomic_inc(&mce_events);
        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                for (;;) {
                        /* When the buffer fills up discard new entries. Assume
                           that the earlier errors are the more interesting. */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW,
                                        (unsigned long *)&mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip. */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                next = entry + 1;
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        set_bit(0, &notify_user);
}
static void print_mce(struct mce *m)
{
        printk(KERN_EMERG "\n"
               KERN_EMERG "HARDWARE ERROR\n"
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
        if (m->ip) {
                printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->ip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->ip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %llx ", m->tsc);
        if (m->addr)
                printk("ADDR %llx ", m->addr);
        if (m->misc)
                printk("MISC %llx ", m->misc);
        printk("\n");
        printk(KERN_EMERG "This is not a software problem!\n");
        printk(KERN_EMERG "Run through mcelog --ascii to decode "
               "and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
        int i;

        for (i = 0; i < MCE_LOG_LEN; i++) {
                unsigned long tsc = mcelog.entry[i].tsc;

                /* Only print entries logged after 'start'. */
                if (time_before(tsc, start))
                        continue;
                print_mce(&mcelog.entry[i]);
                if (backup && mcelog.entry[i].tsc == backup->tsc)
                        backup = NULL;
        }
        if (backup)
                print_mce(backup);
        panic(msg);
}
int mce_available(struct cpuinfo_x86 *c)
{
        if (mce_dont_init)
                return 0;
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                m->ip = regs->ip;
                m->cs = regs->cs;
        } else {
                m->ip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                rdmsrl(rip_msr, m->ip);
                m->cs = 0;
        }
}
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
        struct mce m;
        int i;

        mce_setup(&m);

        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        for (i = 0; i < banks; i++) {
                if (!bank[i] || !test_bit(i, *b))
                        continue;

                m.bank = i;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if (!(m.status & MCI_STATUS_VAL))
                        continue;

                /*
                 * Uncorrected events are handled by the exception handler
                 * when it is enabled. But when the exception is disabled log
                 * everything.
                 *
                 * TBD do the same check for MCI_STATUS_EN here?
                 */
                if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
                        continue;

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                if (!(flags & MCP_TIMESTAMP))
                        m.tsc = 0;
                /*
                 * Don't get the IP here because it's unlikely to
                 * have anything to do with the actual error location.
                 */
                mce_log(&m);
                add_taint(TAINT_MACHINE_CHECK);

                /*
                 * Clear state for this bank.
                 */
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }

        /*
         * Don't clear MCG_STATUS here because it's only defined for
         * exceptions.
         */
}
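/*
 * For reference: each MCA bank i exposes an architecturally defined group
 * of four MSRs starting at MSR_IA32_MC0_CTL, in the order CTL, STATUS,
 * ADDR, MISC.  That is why every per-bank access in this file uses a
 * stride of 4, e.g.:
 *
 *	rdmsrl(MSR_IA32_MC0_STATUS + bank*4, status);
 *	wrmsrl(MSR_IA32_MC0_CTL + bank*4, ctl);
 *
 * (bank, status and ctl here are illustrative locals, not variables of
 * this file.)
 */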
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
        struct mce m, panicm;
        int panicm_found = 0;
        u64 mcestart = 0;
        int i;
        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE.  If tolerant is cranked up, we'll try anyway.
         */
        int no_way_out = 0;
        /*
         * If kill_it gets set, there might be a way to recover from this
         * error.
         */
        int kill_it = 0;
        DECLARE_BITMAP(toclear, MAX_NR_BANKS);

        atomic_inc(&mce_entry);

        if (notify_die(DIE_NMI, "machine check", regs, error_code,
                        18, SIGKILL) == NOTIFY_STOP)
                goto out2;
        if (!banks)
                goto out2;

        mce_setup(&m);

        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        /* if the restart IP is not valid, we're done for */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                no_way_out = 1;

        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                __clear_bit(i, toclear);
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                /*
                 * Non uncorrected errors are handled by machine_check_poll
                 * Leave them alone.
                 */
                if ((m.status & MCI_STATUS_UC) == 0)
                        continue;

                /*
                 * Set taint even when machine check was not enabled.
                 */
                add_taint(TAINT_MACHINE_CHECK);

                __set_bit(i, toclear);

                if (m.status & MCI_STATUS_EN) {
                        /* if PCC was set, there's no way out */
                        no_way_out |= !!(m.status & MCI_STATUS_PCC);
                        /*
                         * If this error was uncorrectable and there was
                         * an overflow, we're in trouble.  If no overflow,
                         * we might get away with just killing a task.
                         */
                        if (m.status & MCI_STATUS_UC) {
                                if (tolerant < 1 || m.status & MCI_STATUS_OVER)
                                        no_way_out = 1;
                                kill_it = 1;
                        }
                } else {
                        /*
                         * Machine check event was not enabled. Clear, but
                         * ignore.
                         */
                        continue;
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                mce_log(&m);

                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
                   and that there is only a single one. */
                if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }
        }

        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
                panicm = m;

        /*
         * If we have decided that we just CAN'T continue, and the user
         * has not set tolerant to an insane level, give up and die.
         */
        if (no_way_out && tolerant < 3)
                mce_panic("Machine check", &panicm, mcestart);

        /*
         * If the error seems to be unrecoverable, something should be
         * done.  Try to kill as little as possible.  If we can kill just
         * one task, do that.  If the user has set the tolerance very
         * high, don't try to do anything at all.
         */
        if (kill_it && tolerant < 3) {
                int user_space = 0;

                /*
                 * If the EIPV bit is set, it means the saved IP is the
                 * instruction which caused the MCE.
                 */
                if (m.mcgstatus & MCG_STATUS_EIPV)
                        user_space = panicm.ip && (panicm.cs & 3);

                /*
                 * If we know that the error was in user space, send a
                 * SIGBUS.  Otherwise, panic if tolerance is low.
                 *
                 * force_sig() takes an awful lot of locks and has a slight
                 * risk of deadlocking.
                 */
                if (user_space) {
                        force_sig(SIGBUS, current);
                } else if (panic_on_oops || tolerant < 2) {
                        mce_panic("Uncorrected machine check",
                                  &panicm, mcestart);
                }
        }

        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);

        /* the last thing we do is clear state */
        for (i = 0; i < banks; i++) {
                if (test_bit(i, toclear))
                        wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
        atomic_dec(&mce_entry);
}
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
        struct mce m;

        mce_setup(&m);
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
        mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
        struct timer_list *t = &per_cpu(mce_timer, data);

        WARN_ON(smp_processor_id() != data);

        if (mce_available(&current_cpu_data))
                machine_check_poll(MCP_TIMESTAMP,
                                   &__get_cpu_var(mce_poll_banks));

        /*
         * Alert userspace if needed.  If we logged an MCE, reduce the
         * polling interval, otherwise increase the polling interval.
         */
        if (mce_notify_user()) {
                next_interval = max(next_interval/2, HZ/100);
        } else {
                next_interval = min(next_interval * 2,
                                (int)round_jiffies_relative(check_interval*HZ));
        }

        t->expires = jiffies + next_interval;
        add_timer(t);
}
static void mce_do_trigger(struct work_struct *work)
{
        call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
        /* Not more than two messages every minute */
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

        clear_thread_flag(TIF_MCE_NOTIFY);
        if (test_and_clear_bit(0, &notify_user)) {
                wake_up_interruptible(&mce_wait);

                /*
                 * There is no risk of missing notifications because
                 * work_pending is always cleared before the function is
                 * executed.
                 */
                if (trigger[0] && !work_pending(&mce_trigger_work))
                        schedule_work(&mce_trigger_work);

                if (__ratelimit(&ratelimit))
                        printk(KERN_INFO "Machine check events logged\n");

                return 1;
        }
        return 0;
}
/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
        /* IDLE_END should be safe - interrupts are back on */
        if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
                mce_notify_user();

        return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
        .notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
        idle_notifier_register(&mce_idle_notifier);
        return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
        unsigned b;
        u64 cap;

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        b = cap & 0xff;
        if (b > MAX_NR_BANKS) {
                printk(KERN_WARNING
                       "MCE: Using only %u machine check banks out of %u\n",
                       MAX_NR_BANKS, b);
                b = MAX_NR_BANKS;
        }

        /* Don't support asymmetric configurations today */
        WARN_ON(banks != 0 && b != banks);
        banks = b;
        if (!bank) {
                bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
                if (!bank)
                        return -ENOMEM;
                memset(bank, 0xff, banks * sizeof(u64));
        }

        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        return 0;
}
static void mce_init(void *dummy)
{
        mce_banks_t all_banks;
        u64 cap;
        int i;

        /*
         * Log the machine checks left over from the previous reset.
         */
        bitmap_fill(all_banks, MAX_NR_BANKS);
        machine_check_poll(MCP_UC, &all_banks);

        set_in_cr4(X86_CR4_MCE);

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
}
/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
                if (c->x86 == 15 && banks > 4)
                        /* disable GART TBL walk error reporting, which trips off
                           incorrectly with the IOMMU & 3ware & Cerberus. */
                        clear_bit(10, (unsigned long *)&bank[4]);
                if (c->x86 <= 17 && mce_bootlog < 0)
                        /* Lots of broken BIOS around that don't clear them
                           by default and leave crap in there. Don't log. */
                        mce_bootlog = 0;
        }
}
static void mce_cpu_features(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
                break;
        default:
                break;
        }
}
static void mce_init_timer(void)
{
        struct timer_list *t = &__get_cpu_var(mce_timer);

        /* data race harmless because everyone sets to the same value */
        if (!next_interval)
                next_interval = check_interval * HZ;
        if (!next_interval)
                return;
        setup_timer(t, mcheck_timer, smp_processor_id());
        t->expires = round_jiffies(jiffies + next_interval);
        add_timer(t);
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
        if (!mce_available(c))
                return;

        if (mce_cap_init() < 0) {
                mce_dont_init = 1;
                return;
        }
        mce_cpu_quirks(c);

        mce_init(NULL);
        mce_cpu_features(c);
        mce_init_timer();
}
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;  /* #times opened */
static int open_exclu;  /* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
                spin_unlock(&mce_state_lock);
                return -EBUSY;
        }

        if (file->f_flags & O_EXCL)
                open_exclu = 1;
        open_count++;

        spin_unlock(&mce_state_lock);

        return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        open_count--;
        open_exclu = 0;

        spin_unlock(&mce_state_lock);

        return 0;
}
static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;

        rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
                        loff_t *off)
{
        unsigned long *cpu_tsc;
        static DEFINE_MUTEX(mce_read_mutex);
        unsigned prev, next;
        char __user *buf = ubuf;
        int i, err;

        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        mutex_lock(&mce_read_mutex);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                mutex_unlock(&mce_read_mutex);
                kfree(cpu_tsc);
                return -EINVAL;
        }

        err = 0;
        prev = 0;
        do {
                for (i = prev; i < next; i++) {
                        unsigned long start = jiffies;

                        /* Wait (briefly) for an in-flight writer to finish. */
                        while (!mcelog.entry[i].finished) {
                                if (time_after_eq(jiffies, start + 2)) {
                                        memset(mcelog.entry + i, 0,
                                               sizeof(struct mce));
                                        goto timeout;
                                }
                                cpu_relax();
                        }
                        err |= copy_to_user(buf, mcelog.entry + i,
                                            sizeof(struct mce));
                        buf += sizeof(struct mce);
timeout:
                        ;
                }

                memset(mcelog.entry + prev, 0,
                       (next - prev) * sizeof(struct mce));
                prev = next;
                next = cmpxchg(&mcelog.next, prev, 0);
        } while (next != prev);

        /*
         * Collect entries that were still getting written before the
         * synchronize.
         */
        on_each_cpu(collect_tscs, cpu_tsc, 1);
        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i,
                                            sizeof(struct mce));
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        mutex_unlock(&mce_read_mutex);
        kfree(cpu_tsc);
        return err ? -EFAULT : buf - ubuf;
}
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &mce_wait, wait);
        if (rcu_dereference(mcelog.next))
                return POLLIN | POLLRDNORM;
        return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;

                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}
static const struct file_operations mce_chrdev_ops = {
        .open = mce_open,
        .release = mce_release,
        .read = mce_read,
        .poll = mce_poll,
        .unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mce_dont_init = 1;
        return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
        if (!strcmp(str, "off"))
                mce_dont_init = 1;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                mce_bootlog = str[0] == 'b';
        else if (isdigit(str[0]))
                get_option(&str, &tolerant);
        else
                printk(KERN_INFO
                       "mce= argument %s ignored. Please use /sys\n", str);
        return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);
/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
        int i;

        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
        return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
        return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
        return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
        mce_init(NULL);
        mce_cpu_features(&current_cpu_data);
        return 0;
}

static void mce_cpu_restart(void *data)
{
        del_timer_sync(&__get_cpu_var(mce_timer));
        if (mce_available(&current_cpu_data))
                mce_init(NULL);
        mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        next_interval = check_interval * HZ;
        on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
        .suspend = mce_suspend,
        .shutdown = mce_shutdown,
        .resume = mce_resume,
        .name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s,             \
                                     struct sysdev_attribute *attr,    \
                                     char *buf) {                      \
                return sprintf(buf, "%lx\n", (unsigned long)var);      \
        }                                                               \
        static ssize_t set_ ## name(struct sys_device *s,              \
                                    struct sysdev_attribute *attr,     \
                                    const char *buf, size_t siz) {     \
                char *end;                                              \
                unsigned long new = simple_strtoul(buf, &end, 0);       \
                if (end == buf)                                         \
                        return -EINVAL;                                 \
                var = new;                                              \
                start;                                                  \
                return end-buf;                                         \
        }                                                               \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

static struct sysdev_attribute *bank_attrs;
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
                         char *buf)
{
        u64 b = bank[attr - bank_attrs];

        return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
                        const char *buf, size_t siz)
{
        char *end;
        u64 new = simple_strtoull(buf, &end, 0);

        if (end == buf)
                return -EINVAL;
        bank[attr - bank_attrs] = new;
        mce_restart();
        return end-buf;
}

static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                            char *buf)
{
        strcpy(buf, trigger);
        strcat(buf, "\n");
        return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                           const char *buf, size_t siz)
{
        char *p;
        int len;

        strncpy(trigger, buf, sizeof(trigger));
        trigger[sizeof(trigger)-1] = 0;
        len = strlen(trigger);
        p = strchr(trigger, '\n');
        if (p)
                *p = 0;
        return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
        &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
        NULL
};
static cpumask_var_t mce_device_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;
        int i;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
        per_cpu(device_mce, cpu).id = cpu;
        per_cpu(device_mce, cpu).cls = &mce_sysclass;

        err = sysdev_register(&per_cpu(device_mce, cpu));
        if (err)
                return err;

        for (i = 0; mce_attributes[i]; i++) {
                err = sysdev_create_file(&per_cpu(device_mce, cpu),
                                         mce_attributes[i]);
                if (err)
                        goto error;
        }
        for (i = 0; i < banks; i++) {
                err = sysdev_create_file(&per_cpu(device_mce, cpu),
                                         &bank_attrs[i]);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_device_initialized);

        return 0;
error2:
        while (--i >= 0)
                sysdev_remove_file(&per_cpu(device_mce, cpu),
                                   &bank_attrs[i]);
error:
        while (--i >= 0)
                sysdev_remove_file(&per_cpu(device_mce, cpu),
                                   mce_attributes[i]);

        sysdev_unregister(&per_cpu(device_mce, cpu));

        return err;
}
static __cpuinit void mce_remove_device(unsigned int cpu)
{
        int i;

        if (!cpumask_test_cpu(cpu, mce_device_initialized))
                return;

        for (i = 0; mce_attributes[i]; i++)
                sysdev_remove_file(&per_cpu(device_mce, cpu),
                                   mce_attributes[i]);
        for (i = 0; i < banks; i++)
                sysdev_remove_file(&per_cpu(device_mce, cpu),
                                   &bank_attrs[i]);
        sysdev_unregister(&per_cpu(device_mce, cpu));
        cpumask_clear_cpu(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(&current_cpu_data))
                return;
        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(&current_cpu_data))
                return;
        if (!(action & CPU_TASKS_FROZEN))
                cmci_reenable();
        for (i = 0; i < banks; i++)
                wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);

        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                mce_create_device(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_remove_device(cpu);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                del_timer_sync(t);
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                break;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                t->expires = round_jiffies(jiffies + next_interval);
                add_timer_on(t, cpu);
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                break;
        case CPU_POST_DEAD:
                /* intentionally ignoring frozen here */
                cmci_rediscover(cpu);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
};
static __init int mce_init_banks(void)
{
        int i;

        bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
                             GFP_KERNEL);
        if (!bank_attrs)
                return -ENOMEM;

        for (i = 0; i < banks; i++) {
                struct sysdev_attribute *a = &bank_attrs[i];

                a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
                if (!a->attr.name)
                        goto nomem;
                a->attr.mode = 0644;
                a->show = show_bank;
                a->store = set_bank;
        }
        return 0;

nomem:
        while (--i >= 0)
                kfree(bank_attrs[i].attr.name);
        kfree(bank_attrs);
        bank_attrs = NULL;
        return -ENOMEM;
}
static __init int mce_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

        err = mce_init_banks();
        if (err)
                return err;

        err = sysdev_class_register(&mce_sysclass);
        if (err)
                return err;

        for_each_online_cpu(i) {
                err = mce_create_device(i);
                if (err)
                        return err;
        }

        register_hotcpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);
        return err;
}

device_initcall(mce_init_device);