1 #include <linux/module.h>
2 #include <linux/slab.h>
/*
 * File-scope decoder state shared by the routines in this file.
 */
/* Per-family hooks (dc_mce/ic_mce/nb_mce); allocated and filled in by mce_amd_init(). */
6 static struct amd_decoder_ops
*fam_ops
;
/* Mask handed to XEC() when the decoders extract the extended error code from m->status. */
8 static u8 xec_mask
= 0xf;
/* Defaults to 0xf; mce_amd_init() sets it to 0x3 in one of its family branches. */
9 static u8 nb_err_cpumask
= 0xf;
/* Zero-initialized (false); written by amd_report_gart_errors(), read by amd_filter_mce(). */
11 static bool report_gart_errors
;
/* Optional callback; amd_decode_nb_mce() calls it with the node id and the MCE record. */
12 static void (*nb_bus_decoder
)(int node_id
, struct mce
*m
);
/*
 * amd_report_gart_errors - set the flag controlling GART error reporting.
 * @v: value stored into report_gart_errors.
 *
 * amd_filter_mce() tests the flag when it sees bank 4 with xec 0x5.
 * Exported with EXPORT_SYMBOL_GPL.
 */
14 void amd_report_gart_errors(bool v
)
16 report_gart_errors
= v
;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors
);
/*
 * amd_register_ecc_decoder - register an ECC decoder callback.
 * @f: callback taking an int and a struct mce pointer.
 *
 * NOTE(review): presumably stores @f in nb_bus_decoder, mirroring
 * amd_unregister_ecc_decoder() - confirm against the function body.
 * Exported with EXPORT_SYMBOL_GPL.
 */
20 void amd_register_ecc_decoder(void (*f
)(int, struct mce
*))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder
);
/*
 * amd_unregister_ecc_decoder - drop the registered ECC decoder callback.
 * @f: the callback the caller believes is currently registered.
 *
 * Triggers WARN_ON() when nb_bus_decoder differs from @f and resets
 * nb_bus_decoder to NULL.
 * NOTE(review): whether these statements sit behind a guard is not
 * visible here - confirm.
 * Exported with EXPORT_SYMBOL_GPL.
 */
26 void amd_unregister_ecc_decoder(void (*f
)(int, struct mce
*))
29 WARN_ON(nb_bus_decoder
!= f
);
31 nb_bus_decoder
= NULL
;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder
);
37 * string representation for the different MCA reported error types, see F3x48
/*
 * Exported string tables holding short names for error-code fields.
 * NOTE(review): presumably indexed through the TT_MSG()/LL_MSG()/R4_MSG()/
 * PP_MSG()/TO_MSG()/II_MSG() macros used by the decoders below; those
 * macros are defined elsewhere - confirm.
 */
41 /* transaction type */
42 const char *tt_msgs
[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs
);
/* cache level */
46 const char *ll_msgs
[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs
);
49 /* memory transaction type */
50 const char *rrrr_msgs
[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs
);
55 /* participating processor */
56 const char *pp_msgs
[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs
);
/* timeout indication */
60 const char *to_msgs
[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs
);
/* memory or I/O */
64 const char *ii_msgs
[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs
);
/*
 * Northbridge error descriptions; f10h_nb_mce() prints the entry at
 * index (xec - offset).
 */
67 static const char *f10h_nb_mce_desc
[] = {
69 "Protocol error (link, L3, probe filter, etc.)",
70 "Parity error in NB-internal arrays",
71 "Link Retry due to IO link transmission error",
72 "L3 ECC data cache error",
73 "ECC error in L3 cache tag",
74 "L3 LRU parity bits error",
75 "ECC Error in the Probe Filter directory"
/*
 * Instruction-cache error descriptions used by f15h_ic_mce(), which
 * indexes the table with xec, xec-2 or xec-4 depending on the branch
 * taken. The xec notes on individual entries give the extended error
 * code that maps to that entry.
 */
78 static const char * const f15h_ic_mce_desc
[] = {
79 "UC during a demand linefill from L2",
80 "Parity error during data load from IC",
81 "Parity error for IC valid bit",
82 "Main tag parity error",
83 "Parity error in prediction queue",
84 "PFB data/address parity error",
85 "Parity error in the branch status reg",
86 "PFB promotion address error",
87 "Tag error during probe/victimization",
88 "Parity error for IC probe tag valid bit",
89 "PFB non-cacheable bit parity error",
90 "PFB valid bit parity error", /* xec = 0xd */
91 "patch RAM", /* xec = 0x10 */
/*
 * Combined-unit error descriptions; amd_decode_cu_mce() indexes the
 * table with (xec - 0x4) or (xec - 0x7) depending on the branch taken.
 */
98 static const char * const f15h_cu_mce_desc
[] = {
99 "Fill ECC error on data fills", /* xec = 0x4 */
100 "Fill parity error on insn fills",
101 "Prefetcher request FIFO parity error",
102 "PRQ address parity error",
103 "PRQ data parity error",
105 "WCC Data ECC error",
106 "WCB Data parity error",
108 "L2 Tag ECC error", /* xec = 0x10 */
109 "Hard L2 Tag ECC error",
110 "Multiple hits on L2 tag",
112 "PRB address parity error"
/*
 * Descriptions indexed directly by xec in amd_decode_fr_mce(). That
 * function prints the entry either as-is or followed by " parity error".
 */
115 static const char * const fr_ex_mce_desc
[] = {
116 "CPU Watchdog timer expire",
117 "Wakeup array dest tag",
121 "Retire dispatch queue",
122 "Mapper checkpoint array",
123 "Physical register file EX0 port",
124 "Physical register file EX1 port",
125 "Physical register file AG0 port",
126 "Physical register file AG1 port",
127 "Flag register file",
128 "DE correctable error could not be corrected"
/*
 * f12h_dc_mce - data-cache decoder hook; f10h_dc_mce() also returns
 * through it.
 * @ec:  error code
 * @xec: extended error code
 *
 * Can complete the log line with "during L1 linefill from L2." or, when
 * the local ll equals LL_L1, with "Data/Tag <R4_MSG(ec)> error.".
 * NOTE(review): the first condition and the return statements are not
 * visible here; the meaning of the bool result should be confirmed.
 */
131 static bool f12h_dc_mce(u16 ec
, u8 xec
)
140 pr_cont("during L1 linefill from L2.\n");
141 else if (ll
== LL_L1
)
142 pr_cont("Data/Tag %s error.\n", R4_MSG(ec
));
/*
 * f10h_dc_mce - data-cache decoder hook.
 * @ec:  error code
 * @xec: extended error code
 *
 * Reports "during data scrub." when R4(ec) is R4_GEN and LL(ec) is LL_L1;
 * the visible return hands the decision to f12h_dc_mce().
 * NOTE(review): how the scrub branch returns is not visible - confirm.
 */
149 static bool f10h_dc_mce(u16 ec
, u8 xec
)
151 if (R4(ec
) == R4_GEN
&& LL(ec
) == LL_L1
) {
152 pr_cont("during data scrub.\n");
155 return f12h_dc_mce(ec
, xec
);
/*
 * k8_dc_mce - data-cache decoder hook.
 * @ec:  error code
 * @xec: extended error code
 *
 * Can report "during system linefill."; the visible return hands the
 * decision to f10h_dc_mce().
 * NOTE(review): the condition guarding the message is not visible.
 */
158 static bool k8_dc_mce(u16 ec
, u8 xec
)
161 pr_cont("during system linefill.\n");
165 return f10h_dc_mce(ec
, xec
);
/*
 * f14h_dc_mce - data-cache decoder hook.
 * @ec:  error code
 * @xec: extended error code
 *
 * One path tests that TT(ec) is TT_DATA and LL(ec) is LL_L1 and then
 * emits one of the parity/copyback/snoop messages. The BUS_ERROR(ec)
 * path tests II(ec) against II_MEM/II_IO and LL(ec) against LL_LG
 * before printing "System read data error on a " plus a suffix.
 * NOTE(review): the outer condition of the first path, most of the
 * selectors and all return values are not visible here - confirm.
 */
168 static bool f14h_dc_mce(u16 ec
, u8 xec
)
175 if (TT(ec
) != TT_DATA
|| LL(ec
) != LL_L1
)
181 pr_cont("Data/Tag parity error due to %s.\n",
182 (r4
== R4_DRD
? "load/hw prf" : "store"));
185 pr_cont("Copyback parity error on a tag miss.\n");
188 pr_cont("Tag parity error during snoop.\n");
193 } else if (BUS_ERROR(ec
)) {
195 if ((II(ec
) != II_MEM
&& II(ec
) != II_IO
) || LL(ec
) != LL_LG
)
198 pr_cont("System read data error on a ");
202 pr_cont("TLB reload.\n");
/*
 * f15h_dc_mce - data-cache decoder hook.
 * @ec:  error code
 * @xec: extended error code
 *
 * Emits one of several access-error messages (data array, linefill, STQ,
 * SCB, tag, LDQ); the BUS_ERROR(ec) path reports a system linefill or an
 * internal livelock (xec == 1) / deadlock condition.
 * NOTE(review): the selectors for the individual messages and the return
 * values are not visible here - confirm.
 */
220 static bool f15h_dc_mce(u16 ec
, u8 xec
)
228 pr_cont("Data Array access error.\n");
232 pr_cont("UC error during a linefill from L2/NB.\n");
237 pr_cont("STQ access error.\n");
241 pr_cont("SCB access error.\n");
245 pr_cont("Tag error.\n");
249 pr_cont("LDQ access error.\n");
255 } else if (BUS_ERROR(ec
)) {
258 pr_cont("during system linefill.\n");
260 pr_cont(" Internal %s condition.\n",
261 ((xec
== 1) ? "livelock" : "deadlock"));
/*
 * amd_decode_dc_mce - decode a data cache MCE.
 * @m: the MCE record; only m->status is read here.
 *
 * Derives ec via EC() and xec via XEC() with xec_mask, then starts a
 * "Data Cache Error: " line. One path, which tests TT(ec) against
 * TT_DATA, prints a TLB message: "locked miss" for xec == 2,
 * "multimatch" for any other non-zero xec, "parity" for zero.
 * Otherwise the per-family fam_ops->dc_mce() hook is consulted, and
 * "Corrupted DC MCE info?" is available as the fallback message.
 * NOTE(review): the outer TLB test and the exact chaining between the
 * hook result and the fallback are not visible here - confirm.
 */
268 static void amd_decode_dc_mce(struct mce
*m
)
270 u16 ec
= EC(m
->status
);
271 u8 xec
= XEC(m
->status
, xec_mask
);
273 pr_emerg(HW_ERR
"Data Cache Error: ");
275 /* TLB error signatures are the same across families */
277 if (TT(ec
) == TT_DATA
) {
278 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
279 ((xec
== 2) ? "locked miss"
280 : (xec
? "multimatch" : "parity")));
283 } else if (fam_ops
->dc_mce(ec
, xec
))
286 pr_emerg(HW_ERR
"Corrupted DC MCE info?\n");
/*
 * k8_ic_mce - instruction-cache decoder hook.
 * @ec:  error code
 * @xec: extended error code
 *
 * Reports a linefill from L2 or, when the local ll equals 0x1, one of
 * the data-load / copyback / tag-snoop messages.
 * NOTE(review): the first condition, the selector inside the ll == 0x1
 * branch and the return values are not visible here - confirm.
 */
289 static bool k8_ic_mce(u16 ec
, u8 xec
)
298 pr_cont("during a linefill from L2.\n");
299 else if (ll
== 0x1) {
302 pr_cont("Parity error during data load.\n");
306 pr_cont("Copyback Parity/Victim error.\n");
310 pr_cont("Tag Snoop error.\n");
/*
 * f14h_ic_mce - instruction-cache decoder hook.
 * @ec:  error code
 * @xec: extended error code
 *
 * Tests that TT(ec) is 0 and LL(ec) is 1, then reports a tag-hit parity
 * error or, when the local r4 equals R4_SNOOP, a tag error during
 * snoop/victimization.
 * NOTE(review): the condition for the first message, the outcome of the
 * TT/LL test and the return values are not visible here - confirm.
 */
323 static bool f14h_ic_mce(u16 ec
, u8 xec
)
329 if (TT(ec
) != 0 || LL(ec
) != 1)
333 pr_cont("Data/tag array parity error for a tag hit.\n");
334 else if (r4
== R4_SNOOP
)
335 pr_cont("Tag error during snoop/victimization.\n");
/*
 * f15h_ic_mce - instruction-cache decoder hook.
 * @ec:  error code
 * @xec: extended error code
 *
 * Prints an entry of f15h_ic_mce_desc[] selected by xec, xec-2 or xec-4;
 * the xec-4 form is wrapped as "Decoder <desc> parity error.".
 * NOTE(review): the xec ranges that pick each form (and thereby bound
 * the array index) and the return values are not visible here - confirm.
 */
342 static bool f15h_ic_mce(u16 ec
, u8 xec
)
351 pr_cont("%s.\n", f15h_ic_mce_desc
[xec
]);
355 pr_cont("%s.\n", f15h_ic_mce_desc
[xec
-2]);
359 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc
[xec
-4]);
/*
 * amd_decode_ic_mce - decode an instruction cache MCE.
 * @m: the MCE record; only m->status is read here.
 *
 * Derives ec via EC() and xec via XEC() with xec_mask, then starts an
 * "Instruction Cache Error: " line. Three outcomes are visible: a TLB
 * message ("multimatch" for non-zero xec, otherwise "parity error"), a
 * bus-error message, or a call to the per-family fam_ops->ic_mce() hook,
 * with "Corrupted IC MCE info?" as the fallback message.
 * NOTE(review): the first condition of the chain and the chaining
 * between the hook result and the fallback are not visible - confirm.
 */
368 static void amd_decode_ic_mce(struct mce
*m
)
370 u16 ec
= EC(m
->status
);
371 u8 xec
= XEC(m
->status
, xec_mask
);
373 pr_emerg(HW_ERR
"Instruction Cache Error: ");
376 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
377 (xec
? "multimatch" : "parity error"));
378 else if (BUS_ERROR(ec
)) {
/* "system linefill" is reported only on family 0xf with status bit 58 set */
379 bool k8
= (boot_cpu_data
.x86
== 0xf && (m
->status
& BIT_64(58)));
381 pr_cont("during %s.\n", (k8
? "system linefill" : "NB data read"));
382 } else if (fam_ops
->ic_mce(ec
, xec
))
385 pr_emerg(HW_ERR
"Corrupted IC MCE info?\n");
/*
 * amd_decode_bu_mce - decode a bus unit MCE.
 * @m: the MCE record; only m->status is read here.
 *
 * Derives ec via EC() and xec via XEC() with xec_mask, starts a
 * "Bus Unit Error" line and completes it depending on xec and on the
 * TLB/bus/memory class of ec; "Corrupted BU MCE info?" is the fallback
 * message.
 * NOTE(review): several of the selecting conditions (including those
 * for the write/victim data buffer messages) are not visible - confirm.
 */
388 static void amd_decode_bu_mce(struct mce
*m
)
390 u16 ec
= EC(m
->status
);
391 u8 xec
= XEC(m
->status
, xec_mask
);
393 pr_emerg(HW_ERR
"Bus Unit Error");
396 pr_cont(" in the write data buffers.\n");
398 pr_cont(" in the victim data buffers.\n");
399 else if (xec
== 0x2 && MEM_ERROR(ec
))
400 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec
));
401 else if (xec
== 0x0) {
403 pr_cont(": %s error in a Page Descriptor Cache or "
404 "Guest TLB.\n", TT_MSG(ec
));
405 else if (BUS_ERROR(ec
))
406 pr_cont(": %s/ECC error in data read from NB: %s.\n",
407 R4_MSG(ec
), PP_MSG(ec
));
408 else if (MEM_ERROR(ec
)) {
412 pr_cont(": %s error during data copyback.\n",
415 pr_cont(": %s parity/ECC error during data "
416 "access from L2.\n", R4_MSG(ec
));
427 pr_emerg(HW_ERR
"Corrupted BU MCE info?\n");
/*
 * amd_decode_cu_mce - decode a combined unit MCE.
 * @m: the MCE record; only m->status is read here.
 *
 * Derives ec via EC() and xec via XEC() with xec_mask, starts a
 * "Combined Unit Error: " line and completes it with a TLB message, a
 * bus-error message, or an entry of f15h_cu_mce_desc[] indexed by
 * (xec - 0x4) or (xec - 0x7); "Corrupted CU MCE info?" is the fallback.
 * NOTE(review): the xec ranges guarding the table lookups are not
 * visible here - confirm that they keep the index inside the table.
 */
430 static void amd_decode_cu_mce(struct mce
*m
)
432 u16 ec
= EC(m
->status
);
433 u8 xec
= XEC(m
->status
, xec_mask
);
435 pr_emerg(HW_ERR
"Combined Unit Error: ");
439 pr_cont("Data parity TLB read error.\n");
441 pr_cont("Poison data provided for TLB fill.\n");
444 } else if (BUS_ERROR(ec
)) {
448 pr_cont("Error during attempted NB data read.\n");
449 } else if (MEM_ERROR(ec
)) {
452 pr_cont("%s.\n", f15h_cu_mce_desc
[xec
- 0x4]);
456 pr_cont("%s.\n", f15h_cu_mce_desc
[xec
- 0x7]);
467 pr_emerg(HW_ERR
"Corrupted CU MCE info?\n");
/*
 * amd_decode_ls_mce - decode a load/store MCE.
 * @m: the MCE record; only m->status is read here.
 *
 * On CPU families >= 0x14 an LS MCE is unexpected and a request to
 * report it is logged. Otherwise a "Load Store Error" line is started
 * and, for bus errors whose r4 is R4_DRD or R4_DWR, completed with
 * " during <R4_MSG(ec)>."; "Corrupted LS MCE info?" is the fallback.
 * NOTE(review): the control flow after the family check and after the
 * BUS_ERROR/r4 test is not visible here - confirm.
 */
470 static void amd_decode_ls_mce(struct mce
*m
)
472 u16 ec
= EC(m
->status
);
473 u8 xec
= XEC(m
->status
, xec_mask
);
475 if (boot_cpu_data
.x86
>= 0x14) {
476 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
477 " please report on LKML.\n");
481 pr_emerg(HW_ERR
"Load Store Error");
486 if (!BUS_ERROR(ec
) || (r4
!= R4_DRD
&& r4
!= R4_DWR
))
489 pr_cont(" during %s.\n", R4_MSG(ec
));
496 pr_emerg(HW_ERR
"Corrupted LS MCE info?\n");
/*
 * k8_nb_mce - northbridge decoder hook; f10h_nb_mce() also calls it first.
 * @ec:  error code
 * @xec: extended error code
 *
 * Emits one of the HT link / GART / atomic RMW / DRAM messages. A
 * boot_cpu_data.x86 == 0x11 check appears just before the DRAM ECC
 * message.
 * NOTE(review): the selectors for the individual messages, the effect
 * of the family 0x11 check and the return values are not visible.
 */
499 static bool k8_nb_mce(u16 ec
, u8 xec
)
505 pr_cont("CRC error detected on HT link.\n");
509 pr_cont("Invalid GART PTE entry during GART table walk.\n");
513 pr_cont("Unsupported atomic RMW received from an IO link.\n");
518 if (boot_cpu_data
.x86
== 0x11)
521 pr_cont("DRAM ECC error detected on the NB.\n");
525 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * f10h_nb_mce - northbridge decoder hook.
 * @ec:  error code
 * @xec: extended error code
 *
 * Consults k8_nb_mce() first, then handles further signatures: table
 * walk errors, a "Compute Unit Data Error." on family 0x15, and entries
 * of f10h_nb_mce_desc[] indexed by (xec - offset).
 * NOTE(review): the xec selectors, the value of offset and the return
 * values are not visible here - confirm.
 */
536 static bool f10h_nb_mce(u16 ec
, u8 xec
)
541 if (k8_nb_mce(ec
, xec
))
555 pr_cont("GART Table Walk data error.\n");
556 else if (BUS_ERROR(ec
))
557 pr_cont("DMA Exclusion Vector Table Walk error.\n");
565 if (boot_cpu_data
.x86
== 0x15)
566 pr_cont("Compute Unit Data Error.\n");
584 pr_cont("%s.\n", f10h_nb_mce_desc
[xec
- offset
]);
/*
 * nb_noop_mce - northbridge decoder hook installed by mce_amd_init() in
 * two of its family branches.
 * @ec:  error code
 * @xec: extended error code
 *
 * NOTE(review): the body is not visible here; the name suggests it does
 * no decoding - confirm what it returns.
 */
590 static bool nb_noop_mce(u16 ec
, u8 xec
)
/*
 * amd_decode_nb_mce - decode a northbridge MCE.
 * @m: the MCE record; m->status and m->extcpu are read.
 *
 * Looks up the node via amd_get_nb_id(m->extcpu) and starts a
 * "Northbridge Error (node N): " line. Some signatures are handled
 * directly (sync, master/target abort, watchdog, SVM DEV); the
 * per-family fam_ops->nb_mce() hook is also consulted. On families
 * 0xf, 0x10 and 0x15, when xec is 0x8 or 0x0 and a callback is
 * registered, nb_bus_decoder() is invoked with the node id and @m.
 * "Corrupted NB MCE info?" is the fallback message.
 * Exported with EXPORT_SYMBOL_GPL.
 * NOTE(review): the selectors for the direct messages and the jumps
 * between the sections are not visible here - confirm.
 */
595 void amd_decode_nb_mce(struct mce
*m
)
597 struct cpuinfo_x86
*c
= &boot_cpu_data
;
598 int node_id
= amd_get_nb_id(m
->extcpu
);
599 u16 ec
= EC(m
->status
);
/* uses a fixed 0x1f mask here rather than the file-wide xec_mask */
600 u8 xec
= XEC(m
->status
, 0x1f);
602 pr_emerg(HW_ERR
"Northbridge Error (node %d): ", node_id
);
606 pr_cont("Sync error (sync packets on HT link detected).\n");
610 pr_cont("HT Master abort.\n");
614 pr_cont("HT Target abort.\n");
618 pr_cont("NB Watchdog timeout.\n");
622 pr_cont("SVM DMA Exclusion Vector error.\n");
629 if (!fam_ops
->nb_mce(ec
, xec
))
632 if (c
->x86
== 0xf || c
->x86
== 0x10 || c
->x86
== 0x15)
633 if ((xec
== 0x8 || xec
== 0x0) && nb_bus_decoder
)
634 nb_bus_decoder(node_id
, m
);
639 pr_emerg(HW_ERR
"Corrupted NB MCE info?\n");
641 EXPORT_SYMBOL_GPL(amd_decode_nb_mce
);
/*
 * amd_decode_fr_mce - decode an MCE that is labelled "Execution Unit"
 * on family 0x15 and "FIROB" on every other family reaching the print.
 * @m: the MCE record; only m->status is read here.
 *
 * Families 0xf and 0x11, and a non-zero xec on families other than
 * 0x15, are singled out before anything is printed. The description
 * comes from fr_ex_mce_desc[xec]: printed as-is for xec 0x0 and 0xc,
 * and with a " parity error" suffix on another path.
 * "Corrupted FR MCE info?" is the fallback message.
 * NOTE(review): what the early checks do, and the bound applied to xec
 * before the second table lookup, are not visible here - confirm.
 */
643 static void amd_decode_fr_mce(struct mce
*m
)
645 struct cpuinfo_x86
*c
= &boot_cpu_data
;
646 u8 xec
= XEC(m
->status
, xec_mask
);
648 if (c
->x86
== 0xf || c
->x86
== 0x11)
651 if (c
->x86
!= 0x15 && xec
!= 0x0)
654 pr_emerg(HW_ERR
"%s Error: ",
655 (c
->x86
== 0x15 ? "Execution Unit" : "FIROB"));
657 if (xec
== 0x0 || xec
== 0xc)
658 pr_cont("%s.\n", fr_ex_mce_desc
[xec
]);
660 pr_cont("%s parity error.\n", fr_ex_mce_desc
[xec
]);
667 pr_emerg(HW_ERR
"Corrupted FR MCE info?\n");
/*
 * amd_decode_fp_mce - decode a floating point unit MCE.
 * @m: the MCE record; only m->status is read here.
 *
 * Starts a "Floating Point Unit Error: " line, names the affected
 * structure and appends " parity error."; "Corrupted FP MCE info?" is
 * the fallback message.
 * NOTE(review): the xec values selecting each structure name are not
 * visible here - confirm.
 */
670 static void amd_decode_fp_mce(struct mce
*m
)
672 u8 xec
= XEC(m
->status
, xec_mask
);
674 pr_emerg(HW_ERR
"Floating Point Unit Error: ");
678 pr_cont("Free List");
682 pr_cont("Physical Register File");
686 pr_cont("Retire Queue");
690 pr_cont("Scheduler table");
694 pr_cont("Status Register File");
702 pr_cont(" parity error.\n");
707 pr_emerg(HW_ERR
"Corrupted FP MCE info?\n");
/*
 * amd_decode_err_code - print the generic fields of an error code.
 * @ec: error code; amd_decode_mce() passes m->status & 0xffff.
 *
 * Prints the cache level and, depending on the error class, the
 * mem/io kind, the transaction type, the memory transaction type
 * (memory and bus errors) and the participating processor together
 * with the timeout indication.
 * NOTE(review): the conditions guarding most of the individual fields
 * are not visible here - confirm.
 */
710 static inline void amd_decode_err_code(u16 ec
)
713 pr_emerg(HW_ERR
"cache level: %s", LL_MSG(ec
));
716 pr_cont(", mem/io: %s", II_MSG(ec
));
718 pr_cont(", tx: %s", TT_MSG(ec
));
720 if (MEM_ERROR(ec
) || BUS_ERROR(ec
)) {
721 pr_cont(", mem-tx: %s", R4_MSG(ec
));
724 pr_cont(", part-proc: %s (%s)", PP_MSG(ec
), TO_MSG(ec
));
731 * Filter out unwanted MCE signatures here.
/*
 * amd_filter_mce - test an MCE against the signatures to be filtered.
 * @m: the MCE record; m->status and m->bank are read.
 *
 * xec is taken as bits [20:16] of m->status (fixed 0x1f mask). The one
 * visible signature is bank 4 with xec 0x5 while report_gart_errors is
 * clear.
 * NOTE(review): the return statements are not visible; presumably true
 * means "skip this record", as amd_decode_mce() tests the result before
 * printing anything - confirm.
 */
733 static bool amd_filter_mce(struct mce
*m
)
735 u8 xec
= (m
->status
>> 16) & 0x1f;
738 * NB GART TLB error reporting is disabled by default.
740 if (m
->bank
== 4 && xec
== 0x5 && !report_gart_errors
)
/*
 * amd_decode_mce - notifier callback that decodes and logs one MCE.
 * @nb:   notifier block (not referenced in the visible lines)
 * @val:  notifier value (not referenced in the visible lines)
 * @data: pointer to the struct mce to decode
 *
 * Consults amd_filter_mce() first. It then prints a CPU/bank header
 * with the Over, UE/CE, MiscV, PCC and AddrV flags from m->status,
 * optionally the Deferred (bit 44) and Poison (bit 43) flags and the
 * ECC kind, followed by the raw status value. One of the
 * amd_decode_*_mce() helpers is invoked and finally
 * amd_decode_err_code() is called on the low 16 status bits.
 * Exported with EXPORT_SYMBOL_GPL.
 * NOTE(review): the bank-to-decoder mapping, the conditions around the
 * optional flags and the return values are not visible here - confirm.
 */
746 int amd_decode_mce(struct notifier_block
*nb
, unsigned long val
, void *data
)
748 struct mce
*m
= (struct mce
*)data
;
749 struct cpuinfo_x86
*c
= &boot_cpu_data
;
752 if (amd_filter_mce(m
))
755 pr_emerg(HW_ERR
"CPU:%d MC%d_STATUS[%s|%s|%s|%s|%s",
757 ((m
->status
& MCI_STATUS_OVER
) ? "Over" : "-"),
758 ((m
->status
& MCI_STATUS_UC
) ? "UE" : "CE"),
759 ((m
->status
& MCI_STATUS_MISCV
) ? "MiscV" : "-"),
760 ((m
->status
& MCI_STATUS_PCC
) ? "PCC" : "-"),
761 ((m
->status
& MCI_STATUS_ADDRV
) ? "AddrV" : "-"));
765 ((m
->status
& BIT_64(44)) ? "Deferred" : "-"),
766 ((m
->status
& BIT_64(43)) ? "Poison" : "-"));
768 /* do the two bits[14:13] of the upper status half (status bits [46:45]) together */
769 ecc
= (m
->status
>> 45) & 0x3;
771 pr_cont("|%sECC", ((ecc
== 2) ? "C" : "U"));
773 pr_cont("]: 0x%016llx\n", m
->status
);
778 amd_decode_dc_mce(m
);
782 amd_decode_ic_mce(m
);
787 amd_decode_cu_mce(m
);
789 amd_decode_bu_mce(m
);
793 amd_decode_ls_mce(m
);
797 amd_decode_nb_mce(m
);
801 amd_decode_fr_mce(m
);
805 amd_decode_fp_mce(m
);
812 amd_decode_err_code(m
->status
& 0xffff);
816 EXPORT_SYMBOL_GPL(amd_decode_mce
);
/*
 * Notifier block routing events to amd_decode_mce(); mce_amd_init()
 * registers it on x86_mce_decoder_chain and mce_amd_exit() removes it.
 */
818 static struct notifier_block amd_mce_dec_nb
= {
819 .notifier_call
= amd_decode_mce
,
/*
 * mce_amd_init - set up in-kernel MCE decoding; run via early_initcall().
 *
 * Checks that the boot CPU vendor is AMD and applies a family/model
 * test that is false exactly for families 0xf..0x12 and for families
 * 0x14 and 0x15 with model <= 0xf. It then allocates fam_ops with
 * kzalloc(), installs a set of dc_mce/ic_mce/nb_mce hooks (one branch
 * also sets nb_err_cpumask to 0x3), announces that decoding is enabled
 * and registers amd_mce_dec_nb on x86_mce_decoder_chain. A warning is
 * available for a family that matches no branch.
 * NOTE(review): the actions taken when the vendor/family checks fail,
 * the handling of a failed allocation, the labels selecting each hook
 * set and the return values are not visible here - confirm.
 */
822 static int __init
mce_amd_init(void)
824 struct cpuinfo_x86
*c
= &boot_cpu_data
;
826 if (c
->x86_vendor
!= X86_VENDOR_AMD
)
829 if ((c
->x86
< 0xf || c
->x86
> 0x12) &&
830 (c
->x86
!= 0x14 || c
->x86_model
> 0xf) &&
831 (c
->x86
!= 0x15 || c
->x86_model
> 0xf))
834 fam_ops
= kzalloc(sizeof(struct amd_decoder_ops
), GFP_KERNEL
);
840 fam_ops
->dc_mce
= k8_dc_mce
;
841 fam_ops
->ic_mce
= k8_ic_mce
;
842 fam_ops
->nb_mce
= k8_nb_mce
;
846 fam_ops
->dc_mce
= f10h_dc_mce
;
847 fam_ops
->ic_mce
= k8_ic_mce
;
848 fam_ops
->nb_mce
= f10h_nb_mce
;
852 fam_ops
->dc_mce
= k8_dc_mce
;
853 fam_ops
->ic_mce
= k8_ic_mce
;
854 fam_ops
->nb_mce
= f10h_nb_mce
;
858 fam_ops
->dc_mce
= f12h_dc_mce
;
859 fam_ops
->ic_mce
= k8_ic_mce
;
860 fam_ops
->nb_mce
= nb_noop_mce
;
864 nb_err_cpumask
= 0x3;
865 fam_ops
->dc_mce
= f14h_dc_mce
;
866 fam_ops
->ic_mce
= f14h_ic_mce
;
867 fam_ops
->nb_mce
= nb_noop_mce
;
872 fam_ops
->dc_mce
= f15h_dc_mce
;
873 fam_ops
->ic_mce
= f15h_ic_mce
;
874 fam_ops
->nb_mce
= f10h_nb_mce
;
878 printk(KERN_WARNING
"Huh? What family is that: %d?!\n", c
->x86
);
883 pr_info("MCE: In-kernel MCE decoding enabled.\n");
885 atomic_notifier_chain_register(&x86_mce_decoder_chain
, &amd_mce_dec_nb
);
889 early_initcall(mce_amd_init
);
/*
 * mce_amd_exit - module exit: removes amd_mce_dec_nb from
 * x86_mce_decoder_chain.
 * NOTE(review): whether fam_ops is released here is not visible - confirm.
 */
892 static void __exit
mce_amd_exit(void)
894 atomic_notifier_chain_unregister(&x86_mce_decoder_chain
, &amd_mce_dec_nb
);
/* Module metadata and exit hook registration. */
898 MODULE_DESCRIPTION("AMD MCE decoder");
899 MODULE_ALIAS("edac-mce-amd");
900 MODULE_LICENSE("GPL");
901 module_exit(mce_amd_exit
);