1 #include <linux/module.h>
2 #include <linux/slab.h>
6 static struct amd_decoder_ops
*fam_ops
;
8 static u8 xec_mask
= 0xf;
9 static u8 nb_err_cpumask
= 0xf;
11 static bool report_gart_errors
;
12 static void (*nb_bus_decoder
)(int node_id
, struct mce
*m
);
14 void amd_report_gart_errors(bool v
)
16 report_gart_errors
= v
;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors
);
20 void amd_register_ecc_decoder(void (*f
)(int, struct mce
*))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder
);
26 void amd_unregister_ecc_decoder(void (*f
)(int, struct mce
*))
29 WARN_ON(nb_bus_decoder
!= f
);
31 nb_bus_decoder
= NULL
;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder
);
37 * string representation for the different MCA reported error types, see F3x48
41 /* transaction type */
42 const char * const tt_msgs
[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs
);
46 const char * const ll_msgs
[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs
);
49 /* memory transaction type */
50 const char * const rrrr_msgs
[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs
);
55 /* participating processor */
56 const char * const pp_msgs
[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs
);
60 const char * const to_msgs
[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs
);
64 const char * const ii_msgs
[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs
);
67 static const char * const f15h_mc1_mce_desc
[] = {
68 "UC during a demand linefill from L2",
69 "Parity error during data load from IC",
70 "Parity error for IC valid bit",
71 "Main tag parity error",
72 "Parity error in prediction queue",
73 "PFB data/address parity error",
74 "Parity error in the branch status reg",
75 "PFB promotion address error",
76 "Tag error during probe/victimization",
77 "Parity error for IC probe tag valid bit",
78 "PFB non-cacheable bit parity error",
79 "PFB valid bit parity error", /* xec = 0xd */
80 "Microcode Patch Buffer", /* xec = 010 */
87 static const char * const f15h_mc2_mce_desc
[] = {
88 "Fill ECC error on data fills", /* xec = 0x4 */
89 "Fill parity error on insn fills",
90 "Prefetcher request FIFO parity error",
91 "PRQ address parity error",
92 "PRQ data parity error",
95 "WCB Data parity error",
96 "VB Data ECC or parity error",
97 "L2 Tag ECC error", /* xec = 0x10 */
98 "Hard L2 Tag ECC error",
99 "Multiple hits on L2 tag",
101 "PRB address parity error"
104 static const char * const mc4_mce_desc
[] = {
105 "DRAM ECC error detected on the NB",
106 "CRC error detected on HT link",
107 "Link-defined sync error packets detected on HT link",
110 "Invalid GART PTE entry during GART table walk",
111 "Unsupported atomic RMW received from an IO link",
112 "Watchdog timeout due to lack of progress",
113 "DRAM ECC error detected on the NB",
114 "SVM DMA Exclusion Vector error",
115 "HT data error detected on link",
116 "Protocol error (link, L3, probe filter)",
117 "NB internal arrays parity error",
118 "DRAM addr/ctl signals parity error",
119 "IO link transmission error",
120 "L3 data cache ECC error", /* xec = 0x1c */
121 "L3 cache tag error",
122 "L3 LRU parity bits error",
123 "ECC Error in the Probe Filter directory"
126 static const char * const mc5_mce_desc
[] = {
127 "CPU Watchdog timer expire",
128 "Wakeup array dest tag",
132 "Retire dispatch queue",
133 "Mapper checkpoint array",
134 "Physical register file EX0 port",
135 "Physical register file EX1 port",
136 "Physical register file AG0 port",
137 "Physical register file AG1 port",
138 "Flag register file",
142 static bool f12h_mc0_mce(u16 ec
, u8 xec
)
151 pr_cont("during L1 linefill from L2.\n");
152 else if (ll
== LL_L1
)
153 pr_cont("Data/Tag %s error.\n", R4_MSG(ec
));
160 static bool f10h_mc0_mce(u16 ec
, u8 xec
)
162 if (R4(ec
) == R4_GEN
&& LL(ec
) == LL_L1
) {
163 pr_cont("during data scrub.\n");
166 return f12h_mc0_mce(ec
, xec
);
169 static bool k8_mc0_mce(u16 ec
, u8 xec
)
172 pr_cont("during system linefill.\n");
176 return f10h_mc0_mce(ec
, xec
);
179 static bool f14h_mc0_mce(u16 ec
, u8 xec
)
186 if (TT(ec
) != TT_DATA
|| LL(ec
) != LL_L1
)
192 pr_cont("Data/Tag parity error due to %s.\n",
193 (r4
== R4_DRD
? "load/hw prf" : "store"));
196 pr_cont("Copyback parity error on a tag miss.\n");
199 pr_cont("Tag parity error during snoop.\n");
204 } else if (BUS_ERROR(ec
)) {
206 if ((II(ec
) != II_MEM
&& II(ec
) != II_IO
) || LL(ec
) != LL_LG
)
209 pr_cont("System read data error on a ");
213 pr_cont("TLB reload.\n");
231 static bool f15h_mc0_mce(u16 ec
, u8 xec
)
239 pr_cont("Data Array access error.\n");
243 pr_cont("UC error during a linefill from L2/NB.\n");
248 pr_cont("STQ access error.\n");
252 pr_cont("SCB access error.\n");
256 pr_cont("Tag error.\n");
260 pr_cont("LDQ access error.\n");
266 } else if (BUS_ERROR(ec
)) {
269 pr_cont("System Read Data Error.\n");
271 pr_cont(" Internal error condition type %d.\n", xec
);
278 static void decode_mc0_mce(struct mce
*m
)
280 u16 ec
= EC(m
->status
);
281 u8 xec
= XEC(m
->status
, xec_mask
);
283 pr_emerg(HW_ERR
"MC0 Error: ");
285 /* TLB error signatures are the same across families */
287 if (TT(ec
) == TT_DATA
) {
288 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
289 ((xec
== 2) ? "locked miss"
290 : (xec
? "multimatch" : "parity")));
293 } else if (fam_ops
->mc0_mce(ec
, xec
))
296 pr_emerg(HW_ERR
"Corrupted MC0 MCE info?\n");
299 static bool k8_mc1_mce(u16 ec
, u8 xec
)
308 pr_cont("during a linefill from L2.\n");
309 else if (ll
== 0x1) {
312 pr_cont("Parity error during data load.\n");
316 pr_cont("Copyback Parity/Victim error.\n");
320 pr_cont("Tag Snoop error.\n");
333 static bool f14h_mc1_mce(u16 ec
, u8 xec
)
339 if (TT(ec
) != 0 || LL(ec
) != 1)
343 pr_cont("Data/tag array parity error for a tag hit.\n");
344 else if (r4
== R4_SNOOP
)
345 pr_cont("Tag error during snoop/victimization.\n");
352 static bool f15h_mc1_mce(u16 ec
, u8 xec
)
361 pr_cont("%s.\n", f15h_mc1_mce_desc
[xec
]);
365 pr_cont("%s.\n", f15h_mc1_mce_desc
[xec
-2]);
369 pr_cont("%s.\n", f15h_mc1_mce_desc
[xec
-4]);
373 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc
[xec
-4]);
382 static void decode_mc1_mce(struct mce
*m
)
384 u16 ec
= EC(m
->status
);
385 u8 xec
= XEC(m
->status
, xec_mask
);
387 pr_emerg(HW_ERR
"MC1 Error: ");
390 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
391 (xec
? "multimatch" : "parity error"));
392 else if (BUS_ERROR(ec
)) {
393 bool k8
= (boot_cpu_data
.x86
== 0xf && (m
->status
& BIT_64(58)));
395 pr_cont("during %s.\n", (k8
? "system linefill" : "NB data read"));
396 } else if (fam_ops
->mc1_mce(ec
, xec
))
399 pr_emerg(HW_ERR
"Corrupted MC1 MCE info?\n");
402 static void decode_mc2_mce(struct mce
*m
)
404 u16 ec
= EC(m
->status
);
405 u8 xec
= XEC(m
->status
, xec_mask
);
407 pr_emerg(HW_ERR
"MC2 Error");
410 pr_cont(" in the write data buffers.\n");
412 pr_cont(" in the victim data buffers.\n");
413 else if (xec
== 0x2 && MEM_ERROR(ec
))
414 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec
));
415 else if (xec
== 0x0) {
417 pr_cont(": %s error in a Page Descriptor Cache or "
418 "Guest TLB.\n", TT_MSG(ec
));
419 else if (BUS_ERROR(ec
))
420 pr_cont(": %s/ECC error in data read from NB: %s.\n",
421 R4_MSG(ec
), PP_MSG(ec
));
422 else if (MEM_ERROR(ec
)) {
426 pr_cont(": %s error during data copyback.\n",
429 pr_cont(": %s parity/ECC error during data "
430 "access from L2.\n", R4_MSG(ec
));
441 pr_emerg(HW_ERR
"Corrupted MC2 MCE info?\n");
444 static void decode_f15_mc2_mce(struct mce
*m
)
446 u16 ec
= EC(m
->status
);
447 u8 xec
= XEC(m
->status
, xec_mask
);
449 pr_emerg(HW_ERR
"MC2 Error: ");
453 pr_cont("Data parity TLB read error.\n");
455 pr_cont("Poison data provided for TLB fill.\n");
457 goto wrong_f15_mc2_mce
;
458 } else if (BUS_ERROR(ec
)) {
460 goto wrong_f15_mc2_mce
;
462 pr_cont("Error during attempted NB data read.\n");
463 } else if (MEM_ERROR(ec
)) {
466 pr_cont("%s.\n", f15h_mc2_mce_desc
[xec
- 0x4]);
470 pr_cont("%s.\n", f15h_mc2_mce_desc
[xec
- 0x7]);
474 goto wrong_f15_mc2_mce
;
481 pr_emerg(HW_ERR
"Corrupted MC2 MCE info?\n");
484 static void decode_mc3_mce(struct mce
*m
)
486 u16 ec
= EC(m
->status
);
487 u8 xec
= XEC(m
->status
, xec_mask
);
489 if (boot_cpu_data
.x86
>= 0x14) {
490 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
491 " please report on LKML.\n");
495 pr_emerg(HW_ERR
"MC3 Error");
500 if (!BUS_ERROR(ec
) || (r4
!= R4_DRD
&& r4
!= R4_DWR
))
503 pr_cont(" during %s.\n", R4_MSG(ec
));
510 pr_emerg(HW_ERR
"Corrupted MC3 MCE info?\n");
513 static void decode_mc4_mce(struct mce
*m
)
515 struct cpuinfo_x86
*c
= &boot_cpu_data
;
516 int node_id
= amd_get_nb_id(m
->extcpu
);
517 u16 ec
= EC(m
->status
);
518 u8 xec
= XEC(m
->status
, 0x1f);
521 pr_emerg(HW_ERR
"MC4 Error (node %d): ", node_id
);
526 /* special handling for DRAM ECCs */
527 if (xec
== 0x0 || xec
== 0x8) {
528 /* no ECCs on F11h */
532 pr_cont("%s.\n", mc4_mce_desc
[xec
]);
535 nb_bus_decoder(node_id
, m
);
542 pr_cont("GART Table Walk data error.\n");
543 else if (BUS_ERROR(ec
))
544 pr_cont("DMA Exclusion Vector Table Walk error.\n");
550 if (boot_cpu_data
.x86
== 0x15)
551 pr_cont("Compute Unit Data Error.\n");
564 pr_cont("%s.\n", mc4_mce_desc
[xec
- offset
]);
568 pr_emerg(HW_ERR
"Corrupted MC4 MCE info?\n");
571 static void decode_mc5_mce(struct mce
*m
)
573 struct cpuinfo_x86
*c
= &boot_cpu_data
;
574 u8 xec
= XEC(m
->status
, xec_mask
);
576 if (c
->x86
== 0xf || c
->x86
== 0x11)
579 pr_emerg(HW_ERR
"MC5 Error: ");
581 if (xec
== 0x0 || xec
== 0xc)
582 pr_cont("%s.\n", mc5_mce_desc
[xec
]);
584 pr_cont("%s parity error.\n", mc5_mce_desc
[xec
]);
591 pr_emerg(HW_ERR
"Corrupted MC5 MCE info?\n");
594 static void decode_mc6_mce(struct mce
*m
)
596 u8 xec
= XEC(m
->status
, xec_mask
);
598 pr_emerg(HW_ERR
"MC6 Error: ");
602 pr_cont("Free List");
606 pr_cont("Physical Register File");
610 pr_cont("Retire Queue");
614 pr_cont("Scheduler table");
618 pr_cont("Status Register File");
626 pr_cont(" parity error.\n");
631 pr_emerg(HW_ERR
"Corrupted MC6 MCE info?\n");
634 static inline void amd_decode_err_code(u16 ec
)
637 pr_emerg(HW_ERR
"cache level: %s", LL_MSG(ec
));
640 pr_cont(", mem/io: %s", II_MSG(ec
));
642 pr_cont(", tx: %s", TT_MSG(ec
));
644 if (MEM_ERROR(ec
) || BUS_ERROR(ec
)) {
645 pr_cont(", mem-tx: %s", R4_MSG(ec
));
648 pr_cont(", part-proc: %s (%s)", PP_MSG(ec
), TO_MSG(ec
));
655 * Filter out unwanted MCE signatures here.
657 static bool amd_filter_mce(struct mce
*m
)
659 u8 xec
= (m
->status
>> 16) & 0x1f;
662 * NB GART TLB error reporting is disabled by default.
664 if (m
->bank
== 4 && xec
== 0x5 && !report_gart_errors
)
670 static const char *decode_error_status(struct mce
*m
)
672 if (m
->status
& MCI_STATUS_UC
) {
673 if (m
->status
& MCI_STATUS_PCC
)
674 return "System Fatal error.";
675 if (m
->mcgstatus
& MCG_STATUS_RIPV
)
676 return "Uncorrected, software restartable error.";
677 return "Uncorrected, software containable error.";
680 if (m
->status
& MCI_STATUS_DEFERRED
)
681 return "Deferred error.";
683 return "Corrected error, no action required.";
686 int amd_decode_mce(struct notifier_block
*nb
, unsigned long val
, void *data
)
688 struct mce
*m
= (struct mce
*)data
;
689 struct cpuinfo_x86
*c
= &cpu_data(m
->extcpu
);
692 if (amd_filter_mce(m
))
706 decode_f15_mc2_mce(m
);
731 pr_emerg(HW_ERR
"Error Status: %s\n", decode_error_status(m
));
733 pr_emerg(HW_ERR
"CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
735 c
->x86
, c
->x86_model
, c
->x86_mask
,
737 ((m
->status
& MCI_STATUS_OVER
) ? "Over" : "-"),
738 ((m
->status
& MCI_STATUS_UC
) ? "UE" : "CE"),
739 ((m
->status
& MCI_STATUS_MISCV
) ? "MiscV" : "-"),
740 ((m
->status
& MCI_STATUS_PCC
) ? "PCC" : "-"),
741 ((m
->status
& MCI_STATUS_ADDRV
) ? "AddrV" : "-"));
745 ((m
->status
& MCI_STATUS_DEFERRED
) ? "Deferred" : "-"),
746 ((m
->status
& MCI_STATUS_POISON
) ? "Poison" : "-"));
748 /* do the two bits[14:13] together */
749 ecc
= (m
->status
>> 45) & 0x3;
751 pr_cont("|%sECC", ((ecc
== 2) ? "C" : "U"));
753 pr_cont("]: 0x%016llx\n", m
->status
);
755 if (m
->status
& MCI_STATUS_ADDRV
)
756 pr_emerg(HW_ERR
"MC%d_ADDR: 0x%016llx\n", m
->bank
, m
->addr
);
758 amd_decode_err_code(m
->status
& 0xffff);
762 EXPORT_SYMBOL_GPL(amd_decode_mce
);
764 static struct notifier_block amd_mce_dec_nb
= {
765 .notifier_call
= amd_decode_mce
,
768 static int __init
mce_amd_init(void)
770 struct cpuinfo_x86
*c
= &boot_cpu_data
;
772 if (c
->x86_vendor
!= X86_VENDOR_AMD
)
775 if (c
->x86
< 0xf || c
->x86
> 0x15)
778 fam_ops
= kzalloc(sizeof(struct amd_decoder_ops
), GFP_KERNEL
);
784 fam_ops
->mc0_mce
= k8_mc0_mce
;
785 fam_ops
->mc1_mce
= k8_mc1_mce
;
789 fam_ops
->mc0_mce
= f10h_mc0_mce
;
790 fam_ops
->mc1_mce
= k8_mc1_mce
;
794 fam_ops
->mc0_mce
= k8_mc0_mce
;
795 fam_ops
->mc1_mce
= k8_mc1_mce
;
799 fam_ops
->mc0_mce
= f12h_mc0_mce
;
800 fam_ops
->mc1_mce
= k8_mc1_mce
;
804 nb_err_cpumask
= 0x3;
805 fam_ops
->mc0_mce
= f14h_mc0_mce
;
806 fam_ops
->mc1_mce
= f14h_mc1_mce
;
811 fam_ops
->mc0_mce
= f15h_mc0_mce
;
812 fam_ops
->mc1_mce
= f15h_mc1_mce
;
816 printk(KERN_WARNING
"Huh? What family is it: 0x%x?!\n", c
->x86
);
821 pr_info("MCE: In-kernel MCE decoding enabled.\n");
823 mce_register_decode_chain(&amd_mce_dec_nb
);
827 early_initcall(mce_amd_init
);
830 static void __exit
mce_amd_exit(void)
832 mce_unregister_decode_chain(&amd_mce_dec_nb
);
836 MODULE_DESCRIPTION("AMD MCE decoder");
837 MODULE_ALIAS("edac-mce-amd");
838 MODULE_LICENSE("GPL");
839 module_exit(mce_amd_exit
);