1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Family-specific MC0/MC1/MC2 decode callbacks; allocated and filled in by mce_amd_init(). */
6 static struct amd_decoder_ops
*fam_ops
;
/* Mask handed to XEC() when extracting the extended error code from a status word; mce_amd_init() may replace the 0xf default. */
8 static u8 xec_mask
= 0xf;
/* Set through amd_report_gart_errors(); consulted by amd_filter_mce() for bank 4 / xec 0x5 (NB GART TLB) errors. */
10 static bool report_gart_errors
;
/* Hook invoked from decode_mc4_mce() with the node id and the MCE record; reset to NULL by amd_unregister_ecc_decoder(). */
11 static void (*nb_bus_decoder
)(int node_id
, struct mce
*m
);
13 void amd_report_gart_errors(bool v
)
15 report_gart_errors
= v
;
17 EXPORT_SYMBOL_GPL(amd_report_gart_errors
);
19 void amd_register_ecc_decoder(void (*f
)(int, struct mce
*))
23 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder
);
25 void amd_unregister_ecc_decoder(void (*f
)(int, struct mce
*))
28 WARN_ON(nb_bus_decoder
!= f
);
30 nb_bus_decoder
= NULL
;
33 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder
);
36 * string representation for the different MCA reported error types, see F3x48
/* Transaction type names, indexed by the transaction-type field value. */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};
/* Cache level names (presumably the table behind LL_MSG() - confirm in the header). */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};
/* Memory transaction type names, indexed by the memory-transaction field value. */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
/*
 * Participating processor names.  Not static: the symbol is exported
 * right after this definition for use outside this file.
 */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
53 EXPORT_SYMBOL_GPL(pp_msgs
);
/* Timeout status names: index 0 means no timeout, index 1 means timed out. */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};
/* Memory-or-I/O names (presumably the table behind II_MSG() - confirm in the header). */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};
/* Internal error type names; only index 2 ("HWA") is not reserved. */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
/* MC1 error descriptions printed by f15h_mc1_mce(), which indexes this table with xec, xec - 2 or xec - 4. */
64 static const char * const f15h_mc1_mce_desc
[] = {
65 "UC during a demand linefill from L2",
66 "Parity error during data load from IC",
67 "Parity error for IC valid bit",
68 "Main tag parity error",
69 "Parity error in prediction queue",
70 "PFB data/address parity error",
71 "Parity error in the branch status reg",
72 "PFB promotion address error",
73 "Tag error during probe/victimization",
74 "Parity error for IC probe tag valid bit",
75 "PFB non-cacheable bit parity error",
76 "PFB valid bit parity error", /* xec = 0xd */
77 "Microcode Patch Buffer", /* xec = 0x10 */
85 static const char * const f15h_mc2_mce_desc
[] = {
86 "Fill ECC error on data fills", /* xec = 0x4 */
87 "Fill parity error on insn fills",
88 "Prefetcher request FIFO parity error",
89 "PRQ address parity error",
90 "PRQ data parity error",
93 "WCB Data parity error",
94 "VB Data ECC or parity error",
95 "L2 Tag ECC error", /* xec = 0x10 */
96 "Hard L2 Tag ECC error",
97 "Multiple hits on L2 tag",
99 "PRB address parity error"
102 static const char * const mc4_mce_desc
[] = {
103 "DRAM ECC error detected on the NB",
104 "CRC error detected on HT link",
105 "Link-defined sync error packets detected on HT link",
108 "Invalid GART PTE entry during GART table walk",
109 "Unsupported atomic RMW received from an IO link",
110 "Watchdog timeout due to lack of progress",
111 "DRAM ECC error detected on the NB",
112 "SVM DMA Exclusion Vector error",
113 "HT data error detected on link",
114 "Protocol error (link, L3, probe filter)",
115 "NB internal arrays parity error",
116 "DRAM addr/ctl signals parity error",
117 "IO link transmission error",
118 "L3 data cache ECC error", /* xec = 0x1c */
119 "L3 cache tag error",
120 "L3 LRU parity bits error",
121 "ECC Error in the Probe Filter directory"
124 static const char * const mc5_mce_desc
[] = {
125 "CPU Watchdog timer expire",
126 "Wakeup array dest tag",
130 "Retire dispatch queue",
131 "Mapper checkpoint array",
132 "Physical register file EX0 port",
133 "Physical register file EX1 port",
134 "Physical register file AG0 port",
135 "Physical register file AG1 port",
136 "Flag register file",
138 "Retire status queue"
141 static const char * const mc6_mce_desc
[] = {
142 "Hardware Assertion",
144 "Physical Register File",
147 "Status Register File",
150 static bool f12h_mc0_mce(u16 ec
, u8 xec
)
159 pr_cont("during L1 linefill from L2.\n");
160 else if (ll
== LL_L1
)
161 pr_cont("Data/Tag %s error.\n", R4_MSG(ec
));
168 static bool f10h_mc0_mce(u16 ec
, u8 xec
)
170 if (R4(ec
) == R4_GEN
&& LL(ec
) == LL_L1
) {
171 pr_cont("during data scrub.\n");
174 return f12h_mc0_mce(ec
, xec
);
177 static bool k8_mc0_mce(u16 ec
, u8 xec
)
180 pr_cont("during system linefill.\n");
184 return f10h_mc0_mce(ec
, xec
);
187 static bool cat_mc0_mce(u16 ec
, u8 xec
)
194 if (TT(ec
) != TT_DATA
|| LL(ec
) != LL_L1
)
200 pr_cont("Data/Tag parity error due to %s.\n",
201 (r4
== R4_DRD
? "load/hw prf" : "store"));
204 pr_cont("Copyback parity error on a tag miss.\n");
207 pr_cont("Tag parity error during snoop.\n");
212 } else if (BUS_ERROR(ec
)) {
214 if ((II(ec
) != II_MEM
&& II(ec
) != II_IO
) || LL(ec
) != LL_LG
)
217 pr_cont("System read data error on a ");
221 pr_cont("TLB reload.\n");
239 static bool f15h_mc0_mce(u16 ec
, u8 xec
)
247 pr_cont("Data Array access error.\n");
251 pr_cont("UC error during a linefill from L2/NB.\n");
256 pr_cont("STQ access error.\n");
260 pr_cont("SCB access error.\n");
264 pr_cont("Tag error.\n");
268 pr_cont("LDQ access error.\n");
274 } else if (BUS_ERROR(ec
)) {
277 pr_cont("System Read Data Error.\n");
279 pr_cont(" Internal error condition type %d.\n", xec
);
280 } else if (INT_ERROR(ec
)) {
282 pr_cont("Hardware Assert.\n");
292 static void decode_mc0_mce(struct mce
*m
)
294 u16 ec
= EC(m
->status
);
295 u8 xec
= XEC(m
->status
, xec_mask
);
297 pr_emerg(HW_ERR
"MC0 Error: ");
299 /* TLB error signatures are the same across families */
301 if (TT(ec
) == TT_DATA
) {
302 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
303 ((xec
== 2) ? "locked miss"
304 : (xec
? "multimatch" : "parity")));
307 } else if (fam_ops
->mc0_mce(ec
, xec
))
310 pr_emerg(HW_ERR
"Corrupted MC0 MCE info?\n");
313 static bool k8_mc1_mce(u16 ec
, u8 xec
)
322 pr_cont("during a linefill from L2.\n");
323 else if (ll
== 0x1) {
326 pr_cont("Parity error during data load.\n");
330 pr_cont("Copyback Parity/Victim error.\n");
334 pr_cont("Tag Snoop error.\n");
347 static bool cat_mc1_mce(u16 ec
, u8 xec
)
355 if (TT(ec
) != TT_INSTR
)
359 pr_cont("Data/tag array parity error for a tag hit.\n");
360 else if (r4
== R4_SNOOP
)
361 pr_cont("Tag error during snoop/victimization.\n");
363 pr_cont("Tag parity error from victim castout.\n");
365 pr_cont("Microcode patch RAM parity error.\n");
372 static bool f15h_mc1_mce(u16 ec
, u8 xec
)
381 pr_cont("%s.\n", f15h_mc1_mce_desc
[xec
]);
385 pr_cont("%s.\n", f15h_mc1_mce_desc
[xec
-2]);
389 pr_cont("%s.\n", f15h_mc1_mce_desc
[xec
-4]);
393 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc
[xec
-4]);
402 static void decode_mc1_mce(struct mce
*m
)
404 u16 ec
= EC(m
->status
);
405 u8 xec
= XEC(m
->status
, xec_mask
);
407 pr_emerg(HW_ERR
"MC1 Error: ");
410 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
411 (xec
? "multimatch" : "parity error"));
412 else if (BUS_ERROR(ec
)) {
413 bool k8
= (boot_cpu_data
.x86
== 0xf && (m
->status
& BIT_64(58)));
415 pr_cont("during %s.\n", (k8
? "system linefill" : "NB data read"));
416 } else if (INT_ERROR(ec
)) {
418 pr_cont("Hardware Assert.\n");
421 } else if (fam_ops
->mc1_mce(ec
, xec
))
429 pr_emerg(HW_ERR
"Corrupted MC1 MCE info?\n");
432 static bool k8_mc2_mce(u16 ec
, u8 xec
)
437 pr_cont(" in the write data buffers.\n");
439 pr_cont(" in the victim data buffers.\n");
440 else if (xec
== 0x2 && MEM_ERROR(ec
))
441 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec
));
442 else if (xec
== 0x0) {
444 pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
446 else if (BUS_ERROR(ec
))
447 pr_cont(": %s/ECC error in data read from NB: %s.\n",
448 R4_MSG(ec
), PP_MSG(ec
));
449 else if (MEM_ERROR(ec
)) {
453 pr_cont(": %s error during data copyback.\n",
456 pr_cont(": %s parity/ECC error during data "
457 "access from L2.\n", R4_MSG(ec
));
468 static bool f15h_mc2_mce(u16 ec
, u8 xec
)
474 pr_cont("Data parity TLB read error.\n");
476 pr_cont("Poison data provided for TLB fill.\n");
479 } else if (BUS_ERROR(ec
)) {
483 pr_cont("Error during attempted NB data read.\n");
484 } else if (MEM_ERROR(ec
)) {
487 pr_cont("%s.\n", f15h_mc2_mce_desc
[xec
- 0x4]);
491 pr_cont("%s.\n", f15h_mc2_mce_desc
[xec
- 0x7]);
497 } else if (INT_ERROR(ec
)) {
499 pr_cont("Hardware Assert.\n");
507 static bool f16h_mc2_mce(u16 ec
, u8 xec
)
516 pr_cont("%cBUFF parity error.\n", (r4
== R4_RD
) ? 'I' : 'O');
521 pr_cont("ECC error in L2 tag (%s).\n",
522 ((r4
== R4_GEN
) ? "BankReq" :
523 ((r4
== R4_SNOOP
) ? "Prb" : "Fill")));
528 pr_cont("ECC error in L2 data array (%s).\n",
529 (((r4
== R4_RD
) && !(xec
& 0x3)) ? "Hit" :
530 ((r4
== R4_GEN
) ? "Attr" :
531 ((r4
== R4_EVICT
) ? "Vict" : "Fill"))));
536 pr_cont("Parity error in L2 attribute bits (%s).\n",
537 ((r4
== R4_RD
) ? "Hit" :
538 ((r4
== R4_GEN
) ? "Attr" : "Fill")));
548 static void decode_mc2_mce(struct mce
*m
)
550 u16 ec
= EC(m
->status
);
551 u8 xec
= XEC(m
->status
, xec_mask
);
553 pr_emerg(HW_ERR
"MC2 Error: ");
555 if (!fam_ops
->mc2_mce(ec
, xec
))
556 pr_cont(HW_ERR
"Corrupted MC2 MCE info?\n");
559 static void decode_mc3_mce(struct mce
*m
)
561 u16 ec
= EC(m
->status
);
562 u8 xec
= XEC(m
->status
, xec_mask
);
564 if (boot_cpu_data
.x86
>= 0x14) {
565 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
566 " please report on LKML.\n");
570 pr_emerg(HW_ERR
"MC3 Error");
575 if (!BUS_ERROR(ec
) || (r4
!= R4_DRD
&& r4
!= R4_DWR
))
578 pr_cont(" during %s.\n", R4_MSG(ec
));
585 pr_emerg(HW_ERR
"Corrupted MC3 MCE info?\n");
588 static void decode_mc4_mce(struct mce
*m
)
590 struct cpuinfo_x86
*c
= &boot_cpu_data
;
591 int node_id
= amd_get_nb_id(m
->extcpu
);
592 u16 ec
= EC(m
->status
);
593 u8 xec
= XEC(m
->status
, 0x1f);
596 pr_emerg(HW_ERR
"MC4 Error (node %d): ", node_id
);
601 /* special handling for DRAM ECCs */
602 if (xec
== 0x0 || xec
== 0x8) {
603 /* no ECCs on F11h */
607 pr_cont("%s.\n", mc4_mce_desc
[xec
]);
610 nb_bus_decoder(node_id
, m
);
617 pr_cont("GART Table Walk data error.\n");
618 else if (BUS_ERROR(ec
))
619 pr_cont("DMA Exclusion Vector Table Walk error.\n");
625 if (boot_cpu_data
.x86
== 0x15 || boot_cpu_data
.x86
== 0x16)
626 pr_cont("Compute Unit Data Error.\n");
639 pr_cont("%s.\n", mc4_mce_desc
[xec
- offset
]);
643 pr_emerg(HW_ERR
"Corrupted MC4 MCE info?\n");
646 static void decode_mc5_mce(struct mce
*m
)
648 struct cpuinfo_x86
*c
= &boot_cpu_data
;
649 u16 ec
= EC(m
->status
);
650 u8 xec
= XEC(m
->status
, xec_mask
);
652 if (c
->x86
== 0xf || c
->x86
== 0x11)
655 pr_emerg(HW_ERR
"MC5 Error: ");
659 pr_cont("Hardware Assert.\n");
665 if (xec
== 0x0 || xec
== 0xc)
666 pr_cont("%s.\n", mc5_mce_desc
[xec
]);
668 pr_cont("%s parity error.\n", mc5_mce_desc
[xec
]);
675 pr_emerg(HW_ERR
"Corrupted MC5 MCE info?\n");
678 static void decode_mc6_mce(struct mce
*m
)
680 u8 xec
= XEC(m
->status
, xec_mask
);
682 pr_emerg(HW_ERR
"MC6 Error: ");
687 pr_cont("%s parity error.\n", mc6_mce_desc
[xec
]);
691 pr_emerg(HW_ERR
"Corrupted MC6 MCE info?\n");
694 static inline void amd_decode_err_code(u16 ec
)
697 pr_emerg(HW_ERR
"internal: %s\n", UU_MSG(ec
));
701 pr_emerg(HW_ERR
"cache level: %s", LL_MSG(ec
));
704 pr_cont(", mem/io: %s", II_MSG(ec
));
706 pr_cont(", tx: %s", TT_MSG(ec
));
708 if (MEM_ERROR(ec
) || BUS_ERROR(ec
)) {
709 pr_cont(", mem-tx: %s", R4_MSG(ec
));
712 pr_cont(", part-proc: %s (%s)", PP_MSG(ec
), TO_MSG(ec
));
719 * Filter out unwanted MCE signatures here.
721 static bool amd_filter_mce(struct mce
*m
)
723 u8 xec
= (m
->status
>> 16) & 0x1f;
726 * NB GART TLB error reporting is disabled by default.
728 if (m
->bank
== 4 && xec
== 0x5 && !report_gart_errors
)
734 static const char *decode_error_status(struct mce
*m
)
736 if (m
->status
& MCI_STATUS_UC
) {
737 if (m
->status
& MCI_STATUS_PCC
)
738 return "System Fatal error.";
739 if (m
->mcgstatus
& MCG_STATUS_RIPV
)
740 return "Uncorrected, software restartable error.";
741 return "Uncorrected, software containable error.";
744 if (m
->status
& MCI_STATUS_DEFERRED
)
745 return "Deferred error.";
747 return "Corrected error, no action required.";
750 int amd_decode_mce(struct notifier_block
*nb
, unsigned long val
, void *data
)
752 struct mce
*m
= (struct mce
*)data
;
753 struct cpuinfo_x86
*c
= &cpu_data(m
->extcpu
);
756 if (amd_filter_mce(m
))
759 pr_emerg(HW_ERR
"%s\n", decode_error_status(m
));
761 pr_emerg(HW_ERR
"CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
763 c
->x86
, c
->x86_model
, c
->x86_mask
,
765 ((m
->status
& MCI_STATUS_OVER
) ? "Over" : "-"),
766 ((m
->status
& MCI_STATUS_UC
) ? "UE" : "CE"),
767 ((m
->status
& MCI_STATUS_MISCV
) ? "MiscV" : "-"),
768 ((m
->status
& MCI_STATUS_PCC
) ? "PCC" : "-"),
769 ((m
->status
& MCI_STATUS_ADDRV
) ? "AddrV" : "-"));
771 if (c
->x86
== 0x15 || c
->x86
== 0x16)
773 ((m
->status
& MCI_STATUS_DEFERRED
) ? "Deferred" : "-"),
774 ((m
->status
& MCI_STATUS_POISON
) ? "Poison" : "-"));
776 /* do the two bits[14:13] together */
777 ecc
= (m
->status
>> 45) & 0x3;
779 pr_cont("|%sECC", ((ecc
== 2) ? "C" : "U"));
781 pr_cont("]: 0x%016llx\n", m
->status
);
783 if (m
->status
& MCI_STATUS_ADDRV
)
784 pr_emerg(HW_ERR
"MC%d Error Address: 0x%016llx\n", m
->bank
, m
->addr
);
823 amd_decode_err_code(m
->status
& 0xffff);
827 EXPORT_SYMBOL_GPL(amd_decode_mce
);
/* Notifier block whose callback is amd_decode_mce(); handed to mce_register_decode_chain() and mce_unregister_decode_chain(). */
829 static struct notifier_block amd_mce_dec_nb
= {
830 .notifier_call
= amd_decode_mce
,
833 static int __init
mce_amd_init(void)
835 struct cpuinfo_x86
*c
= &boot_cpu_data
;
837 if (c
->x86_vendor
!= X86_VENDOR_AMD
)
840 fam_ops
= kzalloc(sizeof(struct amd_decoder_ops
), GFP_KERNEL
);
846 fam_ops
->mc0_mce
= k8_mc0_mce
;
847 fam_ops
->mc1_mce
= k8_mc1_mce
;
848 fam_ops
->mc2_mce
= k8_mc2_mce
;
852 fam_ops
->mc0_mce
= f10h_mc0_mce
;
853 fam_ops
->mc1_mce
= k8_mc1_mce
;
854 fam_ops
->mc2_mce
= k8_mc2_mce
;
858 fam_ops
->mc0_mce
= k8_mc0_mce
;
859 fam_ops
->mc1_mce
= k8_mc1_mce
;
860 fam_ops
->mc2_mce
= k8_mc2_mce
;
864 fam_ops
->mc0_mce
= f12h_mc0_mce
;
865 fam_ops
->mc1_mce
= k8_mc1_mce
;
866 fam_ops
->mc2_mce
= k8_mc2_mce
;
870 fam_ops
->mc0_mce
= cat_mc0_mce
;
871 fam_ops
->mc1_mce
= cat_mc1_mce
;
872 fam_ops
->mc2_mce
= k8_mc2_mce
;
876 xec_mask
= c
->x86_model
== 0x60 ? 0x3f : 0x1f;
878 fam_ops
->mc0_mce
= f15h_mc0_mce
;
879 fam_ops
->mc1_mce
= f15h_mc1_mce
;
880 fam_ops
->mc2_mce
= f15h_mc2_mce
;
885 fam_ops
->mc0_mce
= cat_mc0_mce
;
886 fam_ops
->mc1_mce
= cat_mc1_mce
;
887 fam_ops
->mc2_mce
= f16h_mc2_mce
;
891 printk(KERN_WARNING
"Huh? What family is it: 0x%x?!\n", c
->x86
);
896 pr_info("MCE: In-kernel MCE decoding enabled.\n");
898 mce_register_decode_chain(&amd_mce_dec_nb
);
902 early_initcall(mce_amd_init
);
905 static void __exit
mce_amd_exit(void)
907 mce_unregister_decode_chain(&amd_mce_dec_nb
);
911 MODULE_DESCRIPTION("AMD MCE decoder");
912 MODULE_ALIAS("edac-mce-amd");
913 MODULE_LICENSE("GPL");
914 module_exit(mce_amd_exit
);