1 #include <linux/module.h>
2 #include <linux/slab.h>
/*
 * Family-specific decoder callbacks. mce_amd_init() allocates this with
 * kzalloc() and fills in ->dc_mce, ->ic_mce and ->nb_mce.
 */
6 static struct amd_decoder_ops
*fam_ops
;
/*
 * Mask handed to XEC() by most decoders in this file when extracting the
 * extended error code from m->status; starts out as 0xf.
 */
8 static u8 xec_mask
= 0xf;
/*
 * Mask ANDed with the upper status word in amd_decode_nb_mce() when deriving
 * the reported core. mce_amd_init() also assigns 0x3 to it on one of its
 * family paths.
 */
9 static u8 nb_err_cpumask
= 0xf;
/*
 * Written by amd_report_gart_errors(); amd_filter_mce() tests its negation
 * for bank 4 events whose extended error code is 0x5.
 */
11 static bool report_gart_errors
;
/*
 * Optional bus/ECC decoder hook. amd_decode_nb_mce() calls it, when non-NULL,
 * with (node_id, m, nbcfg); amd_unregister_ecc_decoder() resets it to NULL.
 */
12 static void (*nb_bus_decoder
)(int node_id
, struct mce
*m
, u32 nbcfg
);
/*
 * amd_report_gart_errors - store @v in the file-scope report_gart_errors flag.
 * @v: new value of the flag that amd_filter_mce() consults.
 *
 * Exported with EXPORT_SYMBOL_GPL.
 * NOTE(review): the braces of this definition are absent from this listing.
 */
14 void amd_report_gart_errors(bool v
)
16 report_gart_errors
= v
;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors
);
/*
 * amd_register_ecc_decoder - exported entry point taking a decoder callback.
 * @f: callback with the same (int, struct mce *, u32) signature as
 *     nb_bus_decoder.
 *
 * NOTE(review): the function body is absent from this listing; presumably it
 * installs @f as nb_bus_decoder -- confirm against the complete source.
 */
20 void amd_register_ecc_decoder(void (*f
)(int, struct mce
*, u32
))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder
);
/*
 * amd_unregister_ecc_decoder - drop the installed bus/ECC decoder hook.
 * @f: callback the caller believes is installed.
 *
 * Warns (WARN_ON) when @f differs from the current nb_bus_decoder and sets
 * nb_bus_decoder to NULL.
 * NOTE(review): lines are missing from this listing, so any condition that
 * guards these two statements is not visible here.
 */
26 void amd_unregister_ecc_decoder(void (*f
)(int, struct mce
*, u32
))
29 WARN_ON(nb_bus_decoder
!= f
);
31 nb_bus_decoder
= NULL
;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder
);
37 * String representations of the different MCA-reported error types, see F3x48
/*
 * Exported name table. amd_decode_err_code() prints a transaction type as
 * ", tx: %s" through TT_MSG(); presumably TT_MSG() indexes this table --
 * the macro is not defined in this file, confirm in the header.
 */
41 /* transaction type */
42 const char *tt_msgs
[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs
);
/*
 * Exported name table. amd_decode_err_code() prints "cache level: %s"
 * through LL_MSG(); presumably LL_MSG() indexes this table -- the macro is
 * not defined in this file, confirm in the header.
 */
46 const char *ll_msgs
[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs
);
/*
 * Exported name table. amd_decode_err_code() prints ", mem-tx: %s" through
 * R4_MSG(); presumably R4_MSG() indexes this table -- confirm in the header.
 * NOTE(review): the initializer's closing line is absent from this listing.
 */
49 /* memory transaction type */
50 const char *rrrr_msgs
[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs
);
/*
 * Exported name table. amd_decode_err_code() prints ", part-proc: %s"
 * through PP_MSG(); presumably PP_MSG() indexes this table -- confirm in the
 * header.
 */
55 /* participating processor */
56 const char *pp_msgs
[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs
);
/*
 * Exported two-entry timeout name table. amd_decode_err_code() prints a
 * TO_MSG() string in parentheses after the participating processor;
 * presumably TO_MSG() indexes this table -- confirm in the header.
 */
60 const char *to_msgs
[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs
);
/*
 * Exported name table. amd_decode_err_code() prints ", mem/io: %s" through
 * II_MSG(); presumably II_MSG() indexes this table -- confirm in the header.
 */
64 const char *ii_msgs
[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs
);
/*
 * Northbridge error descriptions printed by f10h_nb_mce(), which indexes the
 * table with (xec - offset).
 * NOTE(review): this listing skips at least one line right after the opening
 * brace and the closing line, so entry positions cannot be derived from here.
 */
67 static const char *f10h_nb_mce_desc
[] = {
69 "Protocol error (link, L3, probe filter, etc.)",
70 "Parity error in NB-internal arrays",
71 "Link Retry due to IO link transmission error",
72 "L3 ECC data cache error",
73 "ECC error in L3 cache tag",
74 "L3 LRU parity bits error",
75 "ECC Error in the Probe Filter directory"
/*
 * Instruction cache error descriptions printed by f15h_ic_mce(), which
 * indexes the table with xec, xec-2 or xec-4 depending on the xec range.
 * The per-entry "xec =" comments mark where each index bias starts.
 * NOTE(review): the tail of the initializer is absent from this listing.
 */
78 static const char * const f15h_ic_mce_desc
[] = {
79 "UC during a demand linefill from L2",
80 "Parity error during data load from IC",
81 "Parity error for IC valid bit",
82 "Main tag parity error",
83 "Parity error in prediction queue",
84 "PFB data/address parity error",
85 "Parity error in the branch status reg",
86 "PFB promotion address error",
87 "Tag error during probe/victimization",
88 "Parity error for IC probe tag valid bit",
89 "PFB non-cacheable bit parity error",
90 "PFB valid bit parity error", /* xec = 0xd */
91 "patch RAM", /* xec = 0x10 */
/*
 * Combined unit error descriptions printed by amd_decode_cu_mce(), which
 * indexes the table with (xec - 0x4) or (xec - 0x7).
 * NOTE(review): several entries and the closing line are absent from this
 * listing, so the visible strings are not at contiguous indices.
 */
98 static const char * const f15h_cu_mce_desc
[] = {
99 "Fill ECC error on data fills", /* xec = 0x4 */
100 "Fill parity error on insn fills",
101 "Prefetcher request FIFO parity error",
102 "PRQ address parity error",
103 "PRQ data parity error",
105 "WCC Data ECC error",
106 "WCB Data parity error",
108 "L2 Tag ECC error", /* xec = 0x10 */
109 "Hard L2 Tag ECC error",
110 "Multiple hits on L2 tag",
112 "PRB address parity error"
/*
 * Descriptions printed by amd_decode_fr_mce(), which indexes the table
 * directly with xec.
 * NOTE(review): entries between "Wakeup array dest tag" and
 * "Retire dispatch queue", and the closing line, are absent from this listing.
 */
115 static const char * const fr_ex_mce_desc
[] = {
116 "CPU Watchdog timer expire",
117 "Wakeup array dest tag",
121 "Retire dispatch queue",
122 "Mapper checkpoint array",
123 "Physical register file EX0 port",
124 "Physical register file EX1 port",
125 "Physical register file AG0 port",
126 "Physical register file AG1 port",
127 "Flag register file",
128 "DE correctable error could not be corrected"
/*
 * f12h_dc_mce - data cache decode callback.
 * @ec:  error code.
 * @xec: extended error code.
 *
 * Installed as fam_ops->dc_mce by mce_amd_init() and also called from
 * f10h_dc_mce(). The visible branches complete the message line with
 * pr_cont(); the ll == LL_L1 branch names the memory transaction via R4_MSG().
 * NOTE(review): braces, the definition of ll, the other conditions and the
 * return statements are absent from this listing.
 */
131 static bool f12h_dc_mce(u16 ec
, u8 xec
)
140 pr_cont("during L1 linefill from L2.\n");
141 else if (ll
== LL_L1
)
142 pr_cont("Data/Tag %s error.\n", R4_MSG(ec
));
/*
 * f10h_dc_mce - data cache decode callback.
 * @ec:  error code.
 * @xec: extended error code.
 *
 * Reports "during data scrub." when R4(ec) is R4_GEN and LL(ec) is LL_L1,
 * and contains a hand-off to f12h_dc_mce() whose result it returns.
 * NOTE(review): the lines closing the if-block are absent from this listing,
 * so whether the scrub case returns before the hand-off is not visible.
 */
149 static bool f10h_dc_mce(u16 ec
, u8 xec
)
151 if (R4(ec
) == R4_GEN
&& LL(ec
) == LL_L1
) {
152 pr_cont("during data scrub.\n");
155 return f12h_dc_mce(ec
, xec
);
/*
 * k8_dc_mce - data cache decode callback.
 * @ec:  error code.
 * @xec: extended error code.
 *
 * Contains a "during system linefill." message and a hand-off to
 * f10h_dc_mce() whose result it returns.
 * NOTE(review): the condition selecting the linefill message is absent from
 * this listing.
 */
158 static bool k8_dc_mce(u16 ec
, u8 xec
)
161 pr_cont("during system linefill.\n");
165 return f10h_dc_mce(ec
, xec
);
/*
 * f14h_dc_mce - data cache decode callback.
 * @ec:  error code.
 * @xec: extended error code.
 *
 * Visible checks: one path tests that TT(ec) is TT_DATA and LL(ec) is LL_L1;
 * the BUS_ERROR() path tests that II(ec) is II_MEM or II_IO and that LL(ec)
 * is LL_LG. The messages are appended to the current line with pr_cont().
 * NOTE(review): the switch/case structure, the definition of r4 and every
 * return statement are absent from this listing.
 */
168 static bool f14h_dc_mce(u16 ec
, u8 xec
)
175 if (TT(ec
) != TT_DATA
|| LL(ec
) != LL_L1
)
181 pr_cont("Data/Tag parity error due to %s.\n",
182 (r4
== R4_DRD
? "load/hw prf" : "store"));
185 pr_cont("Copyback parity error on a tag miss.\n");
188 pr_cont("Tag parity error during snoop.\n");
193 } else if (BUS_ERROR(ec
)) {
195 if ((II(ec
) != II_MEM
&& II(ec
) != II_IO
) || LL(ec
) != LL_LG
)
198 pr_cont("System read data error on a ");
202 pr_cont("TLB reload.\n");
/*
 * f15h_dc_mce - data cache decode callback.
 * @ec:  error code.
 * @xec: extended error code.
 *
 * Appends one of the visible messages with pr_cont(). The "Internal %s
 * condition." message says "livelock" when xec == 1 and "deadlock" otherwise.
 * NOTE(review): the conditions selecting each message and the return
 * statements are absent from this listing.
 */
220 static bool f15h_dc_mce(u16 ec
, u8 xec
)
228 pr_cont("Data Array access error.\n");
232 pr_cont("UC error during a linefill from L2/NB.\n");
237 pr_cont("STQ access error.\n");
241 pr_cont("SCB access error.\n");
245 pr_cont("Tag error.\n");
249 pr_cont("LDQ access error.\n");
255 } else if (BUS_ERROR(ec
)) {
258 pr_cont("during system linefill.\n");
260 pr_cont(" Internal %s condition.\n",
261 ((xec
== 1) ? "livelock" : "deadlock"));
/*
 * amd_decode_dc_mce - print a "Data Cache Error" report for @m.
 * @m: machine check record; only m->status is read in the visible lines.
 *
 * Extracts ec with EC() and xec with XEC(..., xec_mask), opens the line with
 * pr_emerg() and completes it with pr_cont(). In the TT(ec) == TT_DATA branch
 * the TLB message reads "locked miss" for xec == 2, "multimatch" for any
 * other non-zero xec and "parity" for xec == 0. Otherwise the
 * family callback fam_ops->dc_mce() is consulted.
 * NOTE(review): the outer condition around the TLB branch and the path that
 * reaches the "Corrupted DC MCE info?" message are absent from this listing.
 */
268 static void amd_decode_dc_mce(struct mce
*m
)
270 u16 ec
= EC(m
->status
);
271 u8 xec
= XEC(m
->status
, xec_mask
);
273 pr_emerg(HW_ERR
"Data Cache Error: ");
275 /* TLB error signatures are the same across families */
277 if (TT(ec
) == TT_DATA
) {
278 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
279 ((xec
== 2) ? "locked miss"
280 : (xec
? "multimatch" : "parity")));
283 } else if (fam_ops
->dc_mce(ec
, xec
))
286 pr_emerg(HW_ERR
"Corrupted DC MCE info?\n");
/*
 * k8_ic_mce - instruction cache decode callback.
 * @ec:  error code.
 * @xec: extended error code.
 *
 * Installed as fam_ops->ic_mce by mce_amd_init() on several family paths.
 * Appends one of the visible messages with pr_cont().
 * NOTE(review): the definition of ll, the inner selection below the
 * ll == 0x1 branch and the return statements are absent from this listing.
 */
289 static bool k8_ic_mce(u16 ec
, u8 xec
)
298 pr_cont("during a linefill from L2.\n");
299 else if (ll
== 0x1) {
302 pr_cont("Parity error during data load.\n");
306 pr_cont("Copyback Parity/Victim error.\n");
310 pr_cont("Tag Snoop error.\n");
/*
 * f14h_ic_mce - instruction cache decode callback.
 * @ec:  error code.
 * @xec: extended error code.
 *
 * Visible check: TT(ec) must be 0 and LL(ec) must be 1 for the messages that
 * follow; the r4 == R4_SNOOP branch reports a snoop/victimization tag error.
 * NOTE(review): the definition of r4, the first branch condition and the
 * return statements are absent from this listing.
 */
323 static bool f14h_ic_mce(u16 ec
, u8 xec
)
329 if (TT(ec
) != 0 || LL(ec
) != 1)
333 pr_cont("Data/tag array parity error for a tag hit.\n");
334 else if (r4
== R4_SNOOP
)
335 pr_cont("Tag error during snoop/victimization.\n");
/*
 * f15h_ic_mce - instruction cache decode callback.
 * @ec:  error code.
 * @xec: extended error code, used to index f15h_ic_mce_desc.
 *
 * Prints a table entry at index xec, xec-2 or xec-4; the xec-4 form wraps
 * the entry as "Decoder %s parity error.".
 * NOTE(review): the case labels that select between the three index biases
 * and the return statements are absent from this listing.
 */
342 static bool f15h_ic_mce(u16 ec
, u8 xec
)
351 pr_cont("%s.\n", f15h_ic_mce_desc
[xec
]);
355 pr_cont("%s.\n", f15h_ic_mce_desc
[xec
-2]);
359 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc
[xec
-4]);
/*
 * amd_decode_ic_mce - print an "Instruction Cache Error" report for @m.
 * @m: machine check record; only m->status is read in the visible lines.
 *
 * Extracts ec/xec like the other bank decoders. The TLB message says
 * "multimatch" for a non-zero xec and "parity error" otherwise. In the
 * BUS_ERROR() branch the text is "system linefill" when the CPU family is
 * 0xf and status bit 58 is set, else "NB data read". Remaining signatures go
 * to fam_ops->ic_mce().
 * NOTE(review): the condition guarding the TLB message and the path to the
 * "Corrupted IC MCE info?" message are absent from this listing.
 */
368 static void amd_decode_ic_mce(struct mce
*m
)
370 u16 ec
= EC(m
->status
);
371 u8 xec
= XEC(m
->status
, xec_mask
);
373 pr_emerg(HW_ERR
"Instruction Cache Error: ");
376 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
377 (xec
? "multimatch" : "parity error"));
378 else if (BUS_ERROR(ec
)) {
379 bool k8
= (boot_cpu_data
.x86
== 0xf && (m
->status
& BIT_64(58)));
381 pr_cont("during %s.\n", (k8
? "system linefill" : "NB data read"));
382 } else if (fam_ops
->ic_mce(ec
, xec
))
385 pr_emerg(HW_ERR
"Corrupted IC MCE info?\n");
/*
 * amd_decode_bu_mce - print a "Bus Unit Error" report for @m.
 * @m: machine check record; only m->status is read in the visible lines.
 *
 * Opens the line with pr_emerg() and selects a continuation from xec and the
 * error-code class: xec == 0x2 with MEM_ERROR() reports the L2 cache tags,
 * and the xec == 0x0 block distinguishes a further case, BUS_ERROR() and
 * MEM_ERROR().
 * NOTE(review): the first two conditions, the inner tests of the xec == 0x0
 * block, one pr_cont() argument line and the path to the "Corrupted BU MCE
 * info?" message are absent from this listing.
 */
388 static void amd_decode_bu_mce(struct mce
*m
)
390 u16 ec
= EC(m
->status
);
391 u8 xec
= XEC(m
->status
, xec_mask
);
393 pr_emerg(HW_ERR
"Bus Unit Error");
396 pr_cont(" in the write data buffers.\n");
398 pr_cont(" in the victim data buffers.\n");
399 else if (xec
== 0x2 && MEM_ERROR(ec
))
400 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec
));
401 else if (xec
== 0x0) {
403 pr_cont(": %s error in a Page Descriptor Cache or "
404 "Guest TLB.\n", TT_MSG(ec
));
405 else if (BUS_ERROR(ec
))
406 pr_cont(": %s/ECC error in data read from NB: %s.\n",
407 R4_MSG(ec
), PP_MSG(ec
));
408 else if (MEM_ERROR(ec
)) {
412 pr_cont(": %s error during data copyback.\n",
415 pr_cont(": %s parity/ECC error during data "
416 "access from L2.\n", R4_MSG(ec
));
427 pr_emerg(HW_ERR
"Corrupted BU MCE info?\n");
/*
 * amd_decode_cu_mce - print a "Combined Unit Error" report for @m.
 * @m: machine check record; only m->status is read in the visible lines.
 *
 * Has separate branches for TLB-related messages, BUS_ERROR() and
 * MEM_ERROR(); the MEM_ERROR() branch prints f15h_cu_mce_desc entries at
 * index (xec - 0x4) or (xec - 0x7).
 * NOTE(review): the conditions and case labels choosing between the
 * messages, and the path to the "Corrupted CU MCE info?" message, are absent
 * from this listing.
 */
430 static void amd_decode_cu_mce(struct mce
*m
)
432 u16 ec
= EC(m
->status
);
433 u8 xec
= XEC(m
->status
, xec_mask
);
435 pr_emerg(HW_ERR
"Combined Unit Error: ");
439 pr_cont("Data parity TLB read error.\n");
441 pr_cont("Poison data provided for TLB fill.\n");
444 } else if (BUS_ERROR(ec
)) {
448 pr_cont("Error during attempted NB data read.\n");
449 } else if (MEM_ERROR(ec
)) {
452 pr_cont("%s.\n", f15h_cu_mce_desc
[xec
- 0x4]);
456 pr_cont("%s.\n", f15h_cu_mce_desc
[xec
- 0x7]);
467 pr_emerg(HW_ERR
"Corrupted CU MCE info?\n");
/*
 * amd_decode_ls_mce - print a "Load Store Error" report for @m.
 * @m: machine check record; only m->status is read in the visible lines.
 *
 * On CPU families >= 0x14 it emits a "please report" message instead. The
 * visible validity test requires BUS_ERROR(ec) with r4 equal to R4_DRD or
 * R4_DWR before printing " during %s." with the R4_MSG() name.
 * NOTE(review): the definition of r4, the xec test, the early return after
 * the family check and the path to the "Corrupted LS MCE info?" message are
 * absent from this listing.
 */
470 static void amd_decode_ls_mce(struct mce
*m
)
472 u16 ec
= EC(m
->status
);
473 u8 xec
= XEC(m
->status
, xec_mask
);
475 if (boot_cpu_data
.x86
>= 0x14) {
476 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
477 " please report on LKML.\n");
481 pr_emerg(HW_ERR
"Load Store Error");
486 if (!BUS_ERROR(ec
) || (r4
!= R4_DRD
&& r4
!= R4_DWR
))
489 pr_cont(" during %s.\n", R4_MSG(ec
));
496 pr_emerg(HW_ERR
"Corrupted LS MCE info?\n");
/*
 * k8_nb_mce - northbridge decode callback.
 * @ec:  error code.
 * @xec: extended error code.
 *
 * Installed as fam_ops->nb_mce by mce_amd_init() and tried first by
 * f10h_nb_mce(). Appends one of the visible messages with pr_cont(); one
 * path is special-cased for CPU family 0x11.
 * NOTE(review): the case labels selecting each message, the action taken for
 * family 0x11 and the return statements are absent from this listing.
 */
499 static bool k8_nb_mce(u16 ec
, u8 xec
)
505 pr_cont("CRC error detected on HT link.\n");
509 pr_cont("Invalid GART PTE entry during GART table walk.\n");
513 pr_cont("Unsupported atomic RMW received from an IO link.\n");
518 if (boot_cpu_data
.x86
== 0x11)
521 pr_cont("DRAM ECC error detected on the NB.\n");
525 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * f10h_nb_mce - northbridge decode callback.
 * @ec:  error code.
 * @xec: extended error code.
 *
 * Tries k8_nb_mce() first, then handles further signatures itself: table
 * walk errors, a "Compute Unit Data Error." message for CPU family 0x15, and
 * a generic lookup in f10h_nb_mce_desc at index (xec - offset).
 * NOTE(review): the definition and per-case values of offset, the case
 * labels and the return statements are absent from this listing, so the
 * bounds of the table lookup cannot be checked from here.
 */
536 static bool f10h_nb_mce(u16 ec
, u8 xec
)
541 if (k8_nb_mce(ec
, xec
))
555 pr_cont("GART Table Walk data error.\n");
556 else if (BUS_ERROR(ec
))
557 pr_cont("DMA Exclusion Vector Table Walk error.\n");
565 if (boot_cpu_data
.x86
== 0x15)
566 pr_cont("Compute Unit Data Error.\n");
584 pr_cont("%s.\n", f10h_nb_mce_desc
[xec
- offset
]);
/*
 * nb_noop_mce - northbridge callback installed by mce_amd_init() on two of
 * its family paths in place of k8_nb_mce()/f10h_nb_mce().
 * @ec:  error code.
 * @xec: extended error code.
 *
 * NOTE(review): the body is absent from this listing; the name suggests it
 * decodes nothing, but the returned value is not visible -- confirm.
 */
590 static bool nb_noop_mce(u16 ec
, u8 xec
)
/*
 * amd_decode_nb_mce - print a "Northbridge Error" report for @m.
 * @node_id: node number printed in the report and passed to nb_bus_decoder.
 * @m:       machine check record; only m->status is read in the visible lines.
 * @nbcfg:   opaque value forwarded unchanged to nb_bus_decoder.
 *
 * Unlike the other decoders, xec is extracted with a fixed 0x1f mask rather
 * than xec_mask. nbsh holds the upper 32 bits of m->status and is masked
 * with nb_err_cpumask to obtain the core number; on family 0x10 with
 * model > 7 the K8_NBSH_ERR_CPU_VAL bit is checked first.
 * After the family callback fam_ops->nb_mce() is consulted, families 0xf and
 * 0x10 additionally invoke nb_bus_decoder, if one is installed, for xec 0x8
 * and 0x0. Exported with EXPORT_SYMBOL_GPL.
 * NOTE(review): the declaration of core, the else-branches of the core
 * selection, the case labels for the HT/watchdog/SVM messages and the path
 * to the "Corrupted NB MCE info?" message are absent from this listing.
 */
595 void amd_decode_nb_mce(int node_id
, struct mce
*m
, u32 nbcfg
)
597 u16 ec
= EC(m
->status
);
598 u8 xec
= XEC(m
->status
, 0x1f);
599 u32 nbsh
= (u32
)(m
->status
>> 32);
602 pr_emerg(HW_ERR
"Northbridge Error (node %d", node_id
);
604 /* F10h, revD can disable ErrCpu[3:0] through ErrCpuVal */
605 if ((boot_cpu_data
.x86
== 0x10) &&
606 (boot_cpu_data
.x86_model
> 7)) {
607 if (nbsh
& K8_NBSH_ERR_CPU_VAL
)
608 core
= nbsh
& nb_err_cpumask
;
610 u8 assoc_cpus
= nbsh
& nb_err_cpumask
;
613 core
= fls(assoc_cpus
) - 1;
617 pr_cont(", core %d): ", core
);
623 pr_cont("Sync error (sync packets on HT link detected).\n");
627 pr_cont("HT Master abort.\n");
631 pr_cont("HT Target abort.\n");
635 pr_cont("NB Watchdog timeout.\n");
639 pr_cont("SVM DMA Exclusion Vector error.\n");
646 if (!fam_ops
->nb_mce(ec
, xec
))
649 if (boot_cpu_data
.x86
== 0xf || boot_cpu_data
.x86
== 0x10)
650 if ((xec
== 0x8 || xec
== 0x0) && nb_bus_decoder
)
651 nb_bus_decoder(node_id
, m
, nbcfg
);
656 pr_emerg(HW_ERR
"Corrupted NB MCE info?\n");
658 EXPORT_SYMBOL_GPL(amd_decode_nb_mce
);
/*
 * amd_decode_fr_mce - print an "Execution Unit" (family 0x15) or "FIROB"
 * (other families) error report for @m.
 * @m: machine check record; only m->status is read in the visible lines.
 *
 * Families 0xf and 0x11 are tested first, then any family other than 0x15
 * with a non-zero xec. The description comes from fr_ex_mce_desc[xec]; it is
 * printed as-is for xec 0x0 and 0xc and with " parity error" appended in the
 * other visible branch.
 * NOTE(review): what the two early tests do, the condition of the second
 * print branch (and thus the upper bound on xec used as an index) and the
 * path to the "Corrupted FR MCE info?" message are absent from this listing.
 */
660 static void amd_decode_fr_mce(struct mce
*m
)
662 struct cpuinfo_x86
*c
= &boot_cpu_data
;
663 u8 xec
= XEC(m
->status
, xec_mask
);
665 if (c
->x86
== 0xf || c
->x86
== 0x11)
668 if (c
->x86
!= 0x15 && xec
!= 0x0)
671 pr_emerg(HW_ERR
"%s Error: ",
672 (c
->x86
== 0x15 ? "Execution Unit" : "FIROB"));
674 if (xec
== 0x0 || xec
== 0xc)
675 pr_cont("%s.\n", fr_ex_mce_desc
[xec
]);
677 pr_cont("%s parity error.\n", fr_ex_mce_desc
[xec
]);
684 pr_emerg(HW_ERR
"Corrupted FR MCE info?\n");
/*
 * amd_decode_fp_mce - print a "Floating Point Unit Error" report for @m.
 * @m: machine check record; only m->status is read in the visible lines.
 *
 * Prints the name of the affected structure followed by " parity error.".
 * NOTE(review): the case labels mapping xec to each name and the path to the
 * "Corrupted FP MCE info?" message are absent from this listing.
 */
687 static void amd_decode_fp_mce(struct mce
*m
)
689 u8 xec
= XEC(m
->status
, xec_mask
);
691 pr_emerg(HW_ERR
"Floating Point Unit Error: ");
695 pr_cont("Free List");
699 pr_cont("Physical Register File");
703 pr_cont("Retire Queue");
707 pr_cont("Scheduler table");
711 pr_cont("Status Register File");
719 pr_cont(" parity error.\n");
724 pr_emerg(HW_ERR
"Corrupted FP MCE info?\n");
/*
 * amd_decode_err_code - print the generic fields of an error code on one line.
 * @ec: error code; amd_decode_mce() passes the low 16 bits of m->status.
 *
 * Starts the line with the cache level and appends, via pr_cont(), the
 * mem/io type, the transaction type, the memory transaction type (inside
 * the MEM_ERROR()/BUS_ERROR() branch) and the participating processor with
 * its timeout state.
 * NOTE(review): the conditions guarding the other fields and the line
 * terminator are absent from this listing.
 */
727 static inline void amd_decode_err_code(u16 ec
)
730 pr_emerg(HW_ERR
"cache level: %s", LL_MSG(ec
));
733 pr_cont(", mem/io: %s", II_MSG(ec
));
735 pr_cont(", tx: %s", TT_MSG(ec
));
737 if (MEM_ERROR(ec
) || BUS_ERROR(ec
)) {
738 pr_cont(", mem-tx: %s", R4_MSG(ec
));
741 pr_cont(", part-proc: %s (%s)", PP_MSG(ec
), TO_MSG(ec
));
/*
 * amd_filter_mce - predicate consulted first by amd_decode_mce().
 * @m: machine check record; m->status and m->bank are read.
 *
 * xec is taken from status bits 20:16. The visible test matches bank 4
 * events with xec == 0x5 while report_gart_errors is false.
 * NOTE(review): both return statements are absent from this listing;
 * presumably a match returns true (filtered) -- confirm.
 */
748 * Filter out unwanted MCE signatures here.
750 static bool amd_filter_mce(struct mce
*m
)
752 u8 xec
= (m
->status
>> 16) & 0x1f;
755 * NB GART TLB error reporting is disabled by default.
757 if (m
->bank
== 4 && xec
== 0x5 && !report_gart_errors
)
763 int amd_decode_mce(struct notifier_block
763 int amd_decode_mce(struct notifier_block
/*
 * amd_decode_mce - notifier callback that prints a decoded report for one
 * machine check record (its signature begins on the preceding line).
 * @nb:   notifier block; not referenced in the visible lines.
 * @val:  notifier value; not referenced in the visible lines.
 * @data: pointer to the struct mce to decode.
 *
 * Consults amd_filter_mce() first, then prints the MCn_STATUS flag summary
 * (Over, UE/CE, MiscV, PCC, AddrV, plus Deferred/Poison from status bits 44
 * and 43 and an ECC field), the raw status value, the bank-specific decode
 * and finally the generic fields via amd_decode_err_code().
 * The ECC field is (m->status >> 45) & 0x3, i.e. status bits 46:45; the
 * existing "[14:13]" comment presumably counts from bit 32 -- confirm.
 * The northbridge path looks the node up with amd_get_nb_id(m->extcpu) and
 * passes 0 as nbcfg. Exported with EXPORT_SYMBOL_GPL.
 * NOTE(review): the bank switch and its case labels, the family conditions
 * around the Deferred/Poison and ECC output, the declarations of ecc and
 * node, the first pr_emerg() argument and all return statements are absent
 * from this listing.
 */
*nb
, unsigned long val
, void *data
)
765 struct mce
*m
= (struct mce
*)data
;
766 struct cpuinfo_x86
*c
= &boot_cpu_data
;
769 if (amd_filter_mce(m
))
772 pr_emerg(HW_ERR
"MC%d_STATUS[%s|%s|%s|%s|%s",
774 ((m
->status
& MCI_STATUS_OVER
) ? "Over" : "-"),
775 ((m
->status
& MCI_STATUS_UC
) ? "UE" : "CE"),
776 ((m
->status
& MCI_STATUS_MISCV
) ? "MiscV" : "-"),
777 ((m
->status
& MCI_STATUS_PCC
) ? "PCC" : "-"),
778 ((m
->status
& MCI_STATUS_ADDRV
) ? "AddrV" : "-"));
782 ((m
->status
& BIT_64(44)) ? "Deferred" : "-"),
783 ((m
->status
& BIT_64(43)) ? "Poison" : "-"));
785 /* do the two bits[14:13] together */
786 ecc
= (m
->status
>> 45) & 0x3;
788 pr_cont("|%sECC", ((ecc
== 2) ? "C" : "U"));
790 pr_cont("]: 0x%016llx\n", m
->status
);
795 amd_decode_dc_mce(m
);
799 amd_decode_ic_mce(m
);
804 amd_decode_cu_mce(m
);
806 amd_decode_bu_mce(m
);
810 amd_decode_ls_mce(m
);
814 node
= amd_get_nb_id(m
->extcpu
);
815 amd_decode_nb_mce(node
, m
, 0);
819 amd_decode_fr_mce(m
);
823 amd_decode_fp_mce(m
);
830 amd_decode_err_code(m
->status
& 0xffff);
834 EXPORT_SYMBOL_GPL(amd_decode_mce
);
/*
 * Notifier block whose callback is amd_decode_mce(). mce_amd_init() registers
 * it on x86_mce_decoder_chain and mce_amd_exit() unregisters it.
 * NOTE(review): the initializer's closing line is absent from this listing.
 */
836 static struct notifier_block amd_mce_dec_nb
= {
837 .notifier_call
= amd_decode_mce
,
/*
 * mce_amd_init - set up the decoder and hook it into the MCE notifier chain.
 *
 * Tests that the boot CPU vendor is X86_VENDOR_AMD and that the family/model
 * is in the supported set (families 0xf..0x12, or 0x14/0x15 with
 * model <= 0xf). Allocates fam_ops with kzalloc(GFP_KERNEL), fills in the
 * dc_mce/ic_mce/nb_mce callbacks for the detected family (one path also
 * narrows nb_err_cpumask to 0x3), warns about an unknown family, and
 * registers amd_mce_dec_nb on x86_mce_decoder_chain.
 * Run through early_initcall().
 * NOTE(review): the switch/case labels tying each callback set to a family,
 * any check of the kzalloc() result, the cleanup on the unknown-family path
 * and all return values are absent from this listing.
 */
840 static int __init
mce_amd_init(void)
842 struct cpuinfo_x86
*c
= &boot_cpu_data
;
844 if (c
->x86_vendor
!= X86_VENDOR_AMD
)
847 if ((c
->x86
< 0xf || c
->x86
> 0x12) &&
848 (c
->x86
!= 0x14 || c
->x86_model
> 0xf) &&
849 (c
->x86
!= 0x15 || c
->x86_model
> 0xf))
852 fam_ops
= kzalloc(sizeof(struct amd_decoder_ops
), GFP_KERNEL
);
858 fam_ops
->dc_mce
= k8_dc_mce
;
859 fam_ops
->ic_mce
= k8_ic_mce
;
860 fam_ops
->nb_mce
= k8_nb_mce
;
864 fam_ops
->dc_mce
= f10h_dc_mce
;
865 fam_ops
->ic_mce
= k8_ic_mce
;
866 fam_ops
->nb_mce
= f10h_nb_mce
;
870 fam_ops
->dc_mce
= k8_dc_mce
;
871 fam_ops
->ic_mce
= k8_ic_mce
;
872 fam_ops
->nb_mce
= f10h_nb_mce
;
876 fam_ops
->dc_mce
= f12h_dc_mce
;
877 fam_ops
->ic_mce
= k8_ic_mce
;
878 fam_ops
->nb_mce
= nb_noop_mce
;
882 nb_err_cpumask
= 0x3;
883 fam_ops
->dc_mce
= f14h_dc_mce
;
884 fam_ops
->ic_mce
= f14h_ic_mce
;
885 fam_ops
->nb_mce
= nb_noop_mce
;
890 fam_ops
->dc_mce
= f15h_dc_mce
;
891 fam_ops
->ic_mce
= f15h_ic_mce
;
892 fam_ops
->nb_mce
= f10h_nb_mce
;
896 printk(KERN_WARNING
"Huh? What family is that: %d?!\n", c
->x86
);
901 pr_info("MCE: In-kernel MCE decoding enabled.\n");
903 atomic_notifier_chain_register(&x86_mce_decoder_chain
, &amd_mce_dec_nb
);
907 early_initcall(mce_amd_init
);
/*
 * mce_amd_exit - module exit handler; removes amd_mce_dec_nb from
 * x86_mce_decoder_chain.
 * NOTE(review): lines are missing from this listing, so whether fam_ops is
 * released here is not visible -- confirm against the complete source.
 */
910 static void __exit
mce_amd_exit(void)
912 atomic_notifier_chain_unregister(&x86_mce_decoder_chain
, &amd_mce_dec_nb
);
/*
 * Module metadata: description, the "edac-mce-amd" alias, GPL license, and
 * mce_amd_exit() as the exit handler.
 */
916 MODULE_DESCRIPTION("AMD MCE decoder");
917 MODULE_ALIAS("edac-mce-amd");
918 MODULE_LICENSE("GPL");
919 module_exit(mce_amd_exit
);