1 #include <linux/module.h>
2 #include <linux/slab.h>
6 static struct amd_decoder_ops
*fam_ops
;
8 static u8 xec_mask
= 0xf;
9 static u8 nb_err_cpumask
= 0xf;
11 static bool report_gart_errors
;
12 static void (*nb_bus_decoder
)(int node_id
, struct mce
*m
);
14 void amd_report_gart_errors(bool v
)
16 report_gart_errors
= v
;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors
);
20 void amd_register_ecc_decoder(void (*f
)(int, struct mce
*))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder
);
26 void amd_unregister_ecc_decoder(void (*f
)(int, struct mce
*))
29 WARN_ON(nb_bus_decoder
!= f
);
31 nb_bus_decoder
= NULL
;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder
);
37 * string representation for the different MCA reported error types, see F3x48
41 /* transaction type */
42 const char * const tt_msgs
[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs
);
46 const char * const ll_msgs
[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs
);
49 /* memory transaction type */
50 const char * const rrrr_msgs
[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs
);
55 /* participating processor */
56 const char * const pp_msgs
[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs
);
60 const char * const to_msgs
[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs
);
64 const char * const ii_msgs
[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs
);
67 static const char * const f15h_ic_mce_desc
[] = {
68 "UC during a demand linefill from L2",
69 "Parity error during data load from IC",
70 "Parity error for IC valid bit",
71 "Main tag parity error",
72 "Parity error in prediction queue",
73 "PFB data/address parity error",
74 "Parity error in the branch status reg",
75 "PFB promotion address error",
76 "Tag error during probe/victimization",
77 "Parity error for IC probe tag valid bit",
78 "PFB non-cacheable bit parity error",
79 "PFB valid bit parity error", /* xec = 0xd */
80 "Microcode Patch Buffer", /* xec = 0x10 */
87 static const char * const f15h_cu_mce_desc
[] = {
88 "Fill ECC error on data fills", /* xec = 0x4 */
89 "Fill parity error on insn fills",
90 "Prefetcher request FIFO parity error",
91 "PRQ address parity error",
92 "PRQ data parity error",
95 "WCB Data parity error",
96 "VB Data ECC or parity error",
97 "L2 Tag ECC error", /* xec = 0x10 */
98 "Hard L2 Tag ECC error",
99 "Multiple hits on L2 tag",
101 "PRB address parity error"
104 static const char * const nb_mce_desc
[] = {
105 "DRAM ECC error detected on the NB",
106 "CRC error detected on HT link",
107 "Link-defined sync error packets detected on HT link",
110 "Invalid GART PTE entry during GART table walk",
111 "Unsupported atomic RMW received from an IO link",
112 "Watchdog timeout due to lack of progress",
113 "DRAM ECC error detected on the NB",
114 "SVM DMA Exclusion Vector error",
115 "HT data error detected on link",
116 "Protocol error (link, L3, probe filter)",
117 "NB internal arrays parity error",
118 "DRAM addr/ctl signals parity error",
119 "IO link transmission error",
120 "L3 data cache ECC error", /* xec = 0x1c */
121 "L3 cache tag error",
122 "L3 LRU parity bits error",
123 "ECC Error in the Probe Filter directory"
126 static const char * const fr_ex_mce_desc
[] = {
127 "CPU Watchdog timer expire",
128 "Wakeup array dest tag",
132 "Retire dispatch queue",
133 "Mapper checkpoint array",
134 "Physical register file EX0 port",
135 "Physical register file EX1 port",
136 "Physical register file AG0 port",
137 "Physical register file AG1 port",
138 "Flag register file",
142 static bool f12h_dc_mce(u16 ec
, u8 xec
)
151 pr_cont("during L1 linefill from L2.\n");
152 else if (ll
== LL_L1
)
153 pr_cont("Data/Tag %s error.\n", R4_MSG(ec
));
160 static bool f10h_dc_mce(u16 ec
, u8 xec
)
162 if (R4(ec
) == R4_GEN
&& LL(ec
) == LL_L1
) {
163 pr_cont("during data scrub.\n");
166 return f12h_dc_mce(ec
, xec
);
169 static bool k8_dc_mce(u16 ec
, u8 xec
)
172 pr_cont("during system linefill.\n");
176 return f10h_dc_mce(ec
, xec
);
179 static bool f14h_dc_mce(u16 ec
, u8 xec
)
186 if (TT(ec
) != TT_DATA
|| LL(ec
) != LL_L1
)
192 pr_cont("Data/Tag parity error due to %s.\n",
193 (r4
== R4_DRD
? "load/hw prf" : "store"));
196 pr_cont("Copyback parity error on a tag miss.\n");
199 pr_cont("Tag parity error during snoop.\n");
204 } else if (BUS_ERROR(ec
)) {
206 if ((II(ec
) != II_MEM
&& II(ec
) != II_IO
) || LL(ec
) != LL_LG
)
209 pr_cont("System read data error on a ");
213 pr_cont("TLB reload.\n");
231 static bool f15h_dc_mce(u16 ec
, u8 xec
)
239 pr_cont("Data Array access error.\n");
243 pr_cont("UC error during a linefill from L2/NB.\n");
248 pr_cont("STQ access error.\n");
252 pr_cont("SCB access error.\n");
256 pr_cont("Tag error.\n");
260 pr_cont("LDQ access error.\n");
266 } else if (BUS_ERROR(ec
)) {
269 pr_cont("System Read Data Error.\n");
271 pr_cont(" Internal error condition type %d.\n", xec
);
278 static void amd_decode_dc_mce(struct mce
*m
)
280 u16 ec
= EC(m
->status
);
281 u8 xec
= XEC(m
->status
, xec_mask
);
283 pr_emerg(HW_ERR
"Data Cache Error: ");
285 /* TLB error signatures are the same across families */
287 if (TT(ec
) == TT_DATA
) {
288 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
289 ((xec
== 2) ? "locked miss"
290 : (xec
? "multimatch" : "parity")));
293 } else if (fam_ops
->dc_mce(ec
, xec
))
296 pr_emerg(HW_ERR
"Corrupted DC MCE info?\n");
299 static bool k8_ic_mce(u16 ec
, u8 xec
)
308 pr_cont("during a linefill from L2.\n");
309 else if (ll
== 0x1) {
312 pr_cont("Parity error during data load.\n");
316 pr_cont("Copyback Parity/Victim error.\n");
320 pr_cont("Tag Snoop error.\n");
333 static bool f14h_ic_mce(u16 ec
, u8 xec
)
339 if (TT(ec
) != 0 || LL(ec
) != 1)
343 pr_cont("Data/tag array parity error for a tag hit.\n");
344 else if (r4
== R4_SNOOP
)
345 pr_cont("Tag error during snoop/victimization.\n");
352 static bool f15h_ic_mce(u16 ec
, u8 xec
)
361 pr_cont("%s.\n", f15h_ic_mce_desc
[xec
]);
365 pr_cont("%s.\n", f15h_ic_mce_desc
[xec
-2]);
369 pr_cont("%s.\n", f15h_ic_mce_desc
[xec
-4]);
373 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc
[xec
-4]);
382 static void amd_decode_ic_mce(struct mce
*m
)
384 u16 ec
= EC(m
->status
);
385 u8 xec
= XEC(m
->status
, xec_mask
);
387 pr_emerg(HW_ERR
"Instruction Cache Error: ");
390 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
391 (xec
? "multimatch" : "parity error"));
392 else if (BUS_ERROR(ec
)) {
393 bool k8
= (boot_cpu_data
.x86
== 0xf && (m
->status
& BIT_64(58)));
395 pr_cont("during %s.\n", (k8
? "system linefill" : "NB data read"));
396 } else if (fam_ops
->ic_mce(ec
, xec
))
399 pr_emerg(HW_ERR
"Corrupted IC MCE info?\n");
402 static void amd_decode_bu_mce(struct mce
*m
)
404 u16 ec
= EC(m
->status
);
405 u8 xec
= XEC(m
->status
, xec_mask
);
407 pr_emerg(HW_ERR
"Bus Unit Error");
410 pr_cont(" in the write data buffers.\n");
412 pr_cont(" in the victim data buffers.\n");
413 else if (xec
== 0x2 && MEM_ERROR(ec
))
414 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec
));
415 else if (xec
== 0x0) {
417 pr_cont(": %s error in a Page Descriptor Cache or "
418 "Guest TLB.\n", TT_MSG(ec
));
419 else if (BUS_ERROR(ec
))
420 pr_cont(": %s/ECC error in data read from NB: %s.\n",
421 R4_MSG(ec
), PP_MSG(ec
));
422 else if (MEM_ERROR(ec
)) {
426 pr_cont(": %s error during data copyback.\n",
429 pr_cont(": %s parity/ECC error during data "
430 "access from L2.\n", R4_MSG(ec
));
441 pr_emerg(HW_ERR
"Corrupted BU MCE info?\n");
444 static void amd_decode_cu_mce(struct mce
*m
)
446 u16 ec
= EC(m
->status
);
447 u8 xec
= XEC(m
->status
, xec_mask
);
449 pr_emerg(HW_ERR
"Combined Unit Error: ");
453 pr_cont("Data parity TLB read error.\n");
455 pr_cont("Poison data provided for TLB fill.\n");
458 } else if (BUS_ERROR(ec
)) {
462 pr_cont("Error during attempted NB data read.\n");
463 } else if (MEM_ERROR(ec
)) {
466 pr_cont("%s.\n", f15h_cu_mce_desc
[xec
- 0x4]);
470 pr_cont("%s.\n", f15h_cu_mce_desc
[xec
- 0x7]);
481 pr_emerg(HW_ERR
"Corrupted CU MCE info?\n");
484 static void amd_decode_ls_mce(struct mce
*m
)
486 u16 ec
= EC(m
->status
);
487 u8 xec
= XEC(m
->status
, xec_mask
);
489 if (boot_cpu_data
.x86
>= 0x14) {
490 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
491 " please report on LKML.\n");
495 pr_emerg(HW_ERR
"Load Store Error");
500 if (!BUS_ERROR(ec
) || (r4
!= R4_DRD
&& r4
!= R4_DWR
))
503 pr_cont(" during %s.\n", R4_MSG(ec
));
510 pr_emerg(HW_ERR
"Corrupted LS MCE info?\n");
513 void amd_decode_nb_mce(struct mce
*m
)
515 struct cpuinfo_x86
*c
= &boot_cpu_data
;
516 int node_id
= amd_get_nb_id(m
->extcpu
);
517 u16 ec
= EC(m
->status
);
518 u8 xec
= XEC(m
->status
, 0x1f);
521 pr_emerg(HW_ERR
"Northbridge Error (node %d): ", node_id
);
526 /* special handling for DRAM ECCs */
527 if (xec
== 0x0 || xec
== 0x8) {
528 /* no ECCs on F11h */
532 pr_cont("%s.\n", nb_mce_desc
[xec
]);
535 nb_bus_decoder(node_id
, m
);
542 pr_cont("GART Table Walk data error.\n");
543 else if (BUS_ERROR(ec
))
544 pr_cont("DMA Exclusion Vector Table Walk error.\n");
550 if (boot_cpu_data
.x86
== 0x15)
551 pr_cont("Compute Unit Data Error.\n");
564 pr_cont("%s.\n", nb_mce_desc
[xec
- offset
]);
568 pr_emerg(HW_ERR
"Corrupted NB MCE info?\n");
570 EXPORT_SYMBOL_GPL(amd_decode_nb_mce
);
572 static void amd_decode_fr_mce(struct mce
*m
)
574 struct cpuinfo_x86
*c
= &boot_cpu_data
;
575 u8 xec
= XEC(m
->status
, xec_mask
);
577 if (c
->x86
== 0xf || c
->x86
== 0x11)
580 pr_emerg(HW_ERR
"%s Error: ",
581 (c
->x86
== 0x15 ? "Execution Unit" : "FIROB"));
583 if (xec
== 0x0 || xec
== 0xc)
584 pr_cont("%s.\n", fr_ex_mce_desc
[xec
]);
586 pr_cont("%s parity error.\n", fr_ex_mce_desc
[xec
]);
593 pr_emerg(HW_ERR
"Corrupted FR MCE info?\n");
596 static void amd_decode_fp_mce(struct mce
*m
)
598 u8 xec
= XEC(m
->status
, xec_mask
);
600 pr_emerg(HW_ERR
"Floating Point Unit Error: ");
604 pr_cont("Free List");
608 pr_cont("Physical Register File");
612 pr_cont("Retire Queue");
616 pr_cont("Scheduler table");
620 pr_cont("Status Register File");
628 pr_cont(" parity error.\n");
633 pr_emerg(HW_ERR
"Corrupted FP MCE info?\n");
636 static inline void amd_decode_err_code(u16 ec
)
639 pr_emerg(HW_ERR
"cache level: %s", LL_MSG(ec
));
642 pr_cont(", mem/io: %s", II_MSG(ec
));
644 pr_cont(", tx: %s", TT_MSG(ec
));
646 if (MEM_ERROR(ec
) || BUS_ERROR(ec
)) {
647 pr_cont(", mem-tx: %s", R4_MSG(ec
));
650 pr_cont(", part-proc: %s (%s)", PP_MSG(ec
), TO_MSG(ec
));
657 * Filter out unwanted MCE signatures here.
659 static bool amd_filter_mce(struct mce
*m
)
661 u8 xec
= (m
->status
>> 16) & 0x1f;
664 * NB GART TLB error reporting is disabled by default.
666 if (m
->bank
== 4 && xec
== 0x5 && !report_gart_errors
)
672 int amd_decode_mce(struct notifier_block
*nb
, unsigned long val
, void *data
)
674 struct mce
*m
= (struct mce
*)data
;
675 struct cpuinfo_x86
*c
= &boot_cpu_data
;
678 if (amd_filter_mce(m
))
681 pr_emerg(HW_ERR
"CPU:%d\tMC%d_STATUS[%s|%s|%s|%s|%s",
683 ((m
->status
& MCI_STATUS_OVER
) ? "Over" : "-"),
684 ((m
->status
& MCI_STATUS_UC
) ? "UE" : "CE"),
685 ((m
->status
& MCI_STATUS_MISCV
) ? "MiscV" : "-"),
686 ((m
->status
& MCI_STATUS_PCC
) ? "PCC" : "-"),
687 ((m
->status
& MCI_STATUS_ADDRV
) ? "AddrV" : "-"));
691 ((m
->status
& BIT_64(44)) ? "Deferred" : "-"),
692 ((m
->status
& BIT_64(43)) ? "Poison" : "-"));
694 /* do the two bits[46:45] of the status (bits[14:13] of its upper 32 bits) together */
695 ecc
= (m
->status
>> 45) & 0x3;
697 pr_cont("|%sECC", ((ecc
== 2) ? "C" : "U"));
699 pr_cont("]: 0x%016llx\n", m
->status
);
701 if (m
->status
& MCI_STATUS_ADDRV
)
702 pr_emerg(HW_ERR
"\tMC%d_ADDR: 0x%016llx\n", m
->bank
, m
->addr
);
706 amd_decode_dc_mce(m
);
710 amd_decode_ic_mce(m
);
715 amd_decode_cu_mce(m
);
717 amd_decode_bu_mce(m
);
721 amd_decode_ls_mce(m
);
725 amd_decode_nb_mce(m
);
729 amd_decode_fr_mce(m
);
733 amd_decode_fp_mce(m
);
740 amd_decode_err_code(m
->status
& 0xffff);
744 EXPORT_SYMBOL_GPL(amd_decode_mce
);
746 static struct notifier_block amd_mce_dec_nb
= {
747 .notifier_call
= amd_decode_mce
,
750 static int __init
mce_amd_init(void)
752 struct cpuinfo_x86
*c
= &boot_cpu_data
;
754 if (c
->x86_vendor
!= X86_VENDOR_AMD
)
757 if (c
->x86
< 0xf || c
->x86
> 0x15)
760 fam_ops
= kzalloc(sizeof(struct amd_decoder_ops
), GFP_KERNEL
);
766 fam_ops
->dc_mce
= k8_dc_mce
;
767 fam_ops
->ic_mce
= k8_ic_mce
;
771 fam_ops
->dc_mce
= f10h_dc_mce
;
772 fam_ops
->ic_mce
= k8_ic_mce
;
776 fam_ops
->dc_mce
= k8_dc_mce
;
777 fam_ops
->ic_mce
= k8_ic_mce
;
781 fam_ops
->dc_mce
= f12h_dc_mce
;
782 fam_ops
->ic_mce
= k8_ic_mce
;
786 nb_err_cpumask
= 0x3;
787 fam_ops
->dc_mce
= f14h_dc_mce
;
788 fam_ops
->ic_mce
= f14h_ic_mce
;
793 fam_ops
->dc_mce
= f15h_dc_mce
;
794 fam_ops
->ic_mce
= f15h_ic_mce
;
798 printk(KERN_WARNING
"Huh? What family is it: 0x%x?!\n", c
->x86
);
803 pr_info("MCE: In-kernel MCE decoding enabled.\n");
805 mce_register_decode_chain(&amd_mce_dec_nb
);
809 early_initcall(mce_amd_init
);
812 static void __exit
mce_amd_exit(void)
814 mce_unregister_decode_chain(&amd_mce_dec_nb
);
818 MODULE_DESCRIPTION("AMD MCE decoder");
819 MODULE_ALIAS("edac-mce-amd");
820 MODULE_LICENSE("GPL");
821 module_exit(mce_amd_exit
);