1 // SPDX-License-Identifier: GPL-2.0
4 * Code shared by both skx_edac and i10nm_edac. Originally split out
5 * from the skx_edac driver.
7 * This file is linked into both skx_edac and i10nm_edac drivers. In
8 * order to avoid link errors, this file must be like a pure library
9 * without including symbols and defines which would otherwise conflict,
10 * when linked once into a module and into a built-in object, at the
11 * same time. For example, __this_module symbol references when that
12 * file is being linked into a built-in object.
14 * Copyright (c) 2018, Intel Corporation.
17 #include <linux/acpi.h>
18 #include <linux/dmi.h>
19 #include <linux/adxl.h>
20 #include <acpi/nfit.h>
22 #include "edac_module.h"
23 #include "skx_common.h"
25 static const char * const component_names
[] = {
26 [INDEX_SOCKET
] = "ProcessorSocketId",
27 [INDEX_MEMCTRL
] = "MemoryControllerId",
28 [INDEX_CHANNEL
] = "ChannelId",
29 [INDEX_DIMM
] = "DimmSlotId",
32 static int component_indices
[ARRAY_SIZE(component_names
)];
33 static int adxl_component_count
;
34 static const char * const *adxl_component_names
;
35 static u64
*adxl_values
;
36 static char *adxl_msg
;
38 static char skx_msg
[MSG_SIZE
];
39 static skx_decode_f skx_decode
;
40 static u64 skx_tolm
, skx_tohm
;
41 static LIST_HEAD(dev_edac_list
);
43 int __init
skx_adxl_get(void)
45 const char * const *names
;
48 names
= adxl_get_component_names();
50 skx_printk(KERN_NOTICE
, "No firmware support for address translation.\n");
54 for (i
= 0; i
< INDEX_MAX
; i
++) {
55 for (j
= 0; names
[j
]; j
++) {
56 if (!strcmp(component_names
[i
], names
[j
])) {
57 component_indices
[i
] = j
;
66 adxl_component_names
= names
;
68 adxl_component_count
++;
70 adxl_values
= kcalloc(adxl_component_count
, sizeof(*adxl_values
),
73 adxl_component_count
= 0;
77 adxl_msg
= kzalloc(MSG_SIZE
, GFP_KERNEL
);
79 adxl_component_count
= 0;
86 skx_printk(KERN_ERR
, "'%s' is not matched from DSM parameters: ",
88 for (j
= 0; names
[j
]; j
++)
89 skx_printk(KERN_CONT
, "%s ", names
[j
]);
90 skx_printk(KERN_CONT
, "\n");
95 void __exit
skx_adxl_put(void)
101 static bool skx_adxl_decode(struct decoded_addr
*res
)
105 if (res
->addr
>= skx_tohm
|| (res
->addr
>= skx_tolm
&&
106 res
->addr
< BIT_ULL(32))) {
107 edac_dbg(0, "Address 0x%llx out of range\n", res
->addr
);
111 if (adxl_decode(res
->addr
, adxl_values
)) {
112 edac_dbg(0, "Failed to decode 0x%llx\n", res
->addr
);
116 res
->socket
= (int)adxl_values
[component_indices
[INDEX_SOCKET
]];
117 res
->imc
= (int)adxl_values
[component_indices
[INDEX_MEMCTRL
]];
118 res
->channel
= (int)adxl_values
[component_indices
[INDEX_CHANNEL
]];
119 res
->dimm
= (int)adxl_values
[component_indices
[INDEX_DIMM
]];
121 for (i
= 0; i
< adxl_component_count
; i
++) {
122 if (adxl_values
[i
] == ~0x0ull
)
125 len
+= snprintf(adxl_msg
+ len
, MSG_SIZE
- len
, " %s:0x%llx",
126 adxl_component_names
[i
], adxl_values
[i
]);
127 if (MSG_SIZE
- len
<= 0)
134 void skx_set_decode(skx_decode_f decode
)
139 int skx_get_src_id(struct skx_dev
*d
, u8
*id
)
143 if (pci_read_config_dword(d
->util_all
, 0xf0, &reg
)) {
144 skx_printk(KERN_ERR
, "Failed to read src id\n");
148 *id
= GET_BITFIELD(reg
, 12, 14);
152 int skx_get_node_id(struct skx_dev
*d
, u8
*id
)
156 if (pci_read_config_dword(d
->util_all
, 0xf4, &reg
)) {
157 skx_printk(KERN_ERR
, "Failed to read node id\n");
161 *id
= GET_BITFIELD(reg
, 0, 2);
165 static int get_width(u32 mtr
)
167 switch (GET_BITFIELD(mtr
, 8, 9)) {
179 * We use the per-socket device @did to count how many sockets are present,
180 * and to determine which PCI buses are associated with each socket. Allocate
181 * and build the full list of all the skx_dev structures that we need here.
183 int skx_get_all_bus_mappings(unsigned int did
, int off
, enum type type
,
184 struct list_head
**list
)
186 struct pci_dev
*pdev
, *prev
;
193 pdev
= pci_get_device(PCI_VENDOR_ID_INTEL
, did
, prev
);
197 d
= kzalloc(sizeof(*d
), GFP_KERNEL
);
203 if (pci_read_config_dword(pdev
, off
, &reg
)) {
206 skx_printk(KERN_ERR
, "Failed to read bus idx\n");
210 d
->bus
[0] = GET_BITFIELD(reg
, 0, 7);
211 d
->bus
[1] = GET_BITFIELD(reg
, 8, 15);
213 d
->seg
= pci_domain_nr(pdev
->bus
);
214 d
->bus
[2] = GET_BITFIELD(reg
, 16, 23);
215 d
->bus
[3] = GET_BITFIELD(reg
, 24, 31);
217 d
->seg
= GET_BITFIELD(reg
, 16, 23);
220 edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x\n",
221 d
->bus
[0], d
->bus
[1], d
->bus
[2], d
->bus
[3]);
222 list_add_tail(&d
->list
, &dev_edac_list
);
227 *list
= &dev_edac_list
;
231 int skx_get_hi_lo(unsigned int did
, int off
[], u64
*tolm
, u64
*tohm
)
233 struct pci_dev
*pdev
;
236 pdev
= pci_get_device(PCI_VENDOR_ID_INTEL
, did
, NULL
);
238 skx_printk(KERN_ERR
, "Can't get tolm/tohm\n");
242 if (pci_read_config_dword(pdev
, off
[0], &reg
)) {
243 skx_printk(KERN_ERR
, "Failed to read tolm\n");
248 if (pci_read_config_dword(pdev
, off
[1], &reg
)) {
249 skx_printk(KERN_ERR
, "Failed to read lower tohm\n");
254 if (pci_read_config_dword(pdev
, off
[2], &reg
)) {
255 skx_printk(KERN_ERR
, "Failed to read upper tohm\n");
258 skx_tohm
|= (u64
)reg
<< 32;
263 edac_dbg(2, "tolm = 0x%llx tohm = 0x%llx\n", skx_tolm
, skx_tohm
);
270 static int skx_get_dimm_attr(u32 reg
, int lobit
, int hibit
, int add
,
271 int minval
, int maxval
, const char *name
)
273 u32 val
= GET_BITFIELD(reg
, lobit
, hibit
);
275 if (val
< minval
|| val
> maxval
) {
276 edac_dbg(2, "bad %s = %d (raw=0x%x)\n", name
, val
, reg
);
282 #define numrank(reg) skx_get_dimm_attr(reg, 12, 13, 0, 0, 2, "ranks")
283 #define numrow(reg) skx_get_dimm_attr(reg, 2, 4, 12, 1, 6, "rows")
284 #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols")
286 int skx_get_dimm_info(u32 mtr
, u32 amap
, struct dimm_info
*dimm
,
287 struct skx_imc
*imc
, int chan
, int dimmno
)
289 int banks
= 16, ranks
, rows
, cols
, npages
;
292 ranks
= numrank(mtr
);
297 * Compute size in 8-byte (2^3) words, then shift to MiB (2^20)
299 size
= ((1ull << (rows
+ cols
+ ranks
)) * banks
) >> (20 - 3);
300 npages
= MiB_TO_PAGES(size
);
302 edac_dbg(0, "mc#%d: channel %d, dimm %d, %lld MiB (%d pages) bank: %d, rank: %d, row: 0x%x, col: 0x%x\n",
303 imc
->mc
, chan
, dimmno
, size
, npages
,
304 banks
, 1 << ranks
, rows
, cols
);
306 imc
->chan
[chan
].dimms
[dimmno
].close_pg
= GET_BITFIELD(mtr
, 0, 0);
307 imc
->chan
[chan
].dimms
[dimmno
].bank_xor_enable
= GET_BITFIELD(mtr
, 9, 9);
308 imc
->chan
[chan
].dimms
[dimmno
].fine_grain_bank
= GET_BITFIELD(amap
, 0, 0);
309 imc
->chan
[chan
].dimms
[dimmno
].rowbits
= rows
;
310 imc
->chan
[chan
].dimms
[dimmno
].colbits
= cols
;
312 dimm
->nr_pages
= npages
;
314 dimm
->dtype
= get_width(mtr
);
315 dimm
->mtype
= MEM_DDR4
;
316 dimm
->edac_mode
= EDAC_SECDED
; /* likely better than this */
317 snprintf(dimm
->label
, sizeof(dimm
->label
), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
318 imc
->src_id
, imc
->lmc
, chan
, dimmno
);
323 int skx_get_nvdimm_info(struct dimm_info
*dimm
, struct skx_imc
*imc
,
324 int chan
, int dimmno
, const char *mod_str
)
331 dev_handle
= ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno
, chan
, imc
->lmc
,
334 smbios_handle
= nfit_get_smbios_id(dev_handle
, &flags
);
335 if (smbios_handle
== -EOPNOTSUPP
) {
336 pr_warn_once("%s: Can't find size of NVDIMM. Try enabling CONFIG_ACPI_NFIT\n", mod_str
);
340 if (smbios_handle
< 0) {
341 skx_printk(KERN_ERR
, "Can't find handle for NVDIMM ADR=0x%x\n", dev_handle
);
345 if (flags
& ACPI_NFIT_MEM_MAP_FAILED
) {
346 skx_printk(KERN_ERR
, "NVDIMM ADR=0x%x is not mapped\n", dev_handle
);
350 size
= dmi_memdev_size(smbios_handle
);
352 skx_printk(KERN_ERR
, "Can't find size for NVDIMM ADR=0x%x/SMBIOS=0x%x\n",
353 dev_handle
, smbios_handle
);
356 dimm
->nr_pages
= size
>> PAGE_SHIFT
;
358 dimm
->dtype
= DEV_UNKNOWN
;
359 dimm
->mtype
= MEM_NVDIMM
;
360 dimm
->edac_mode
= EDAC_SECDED
; /* likely better than this */
362 edac_dbg(0, "mc#%d: channel %d, dimm %d, %llu MiB (%u pages)\n",
363 imc
->mc
, chan
, dimmno
, size
>> 20, dimm
->nr_pages
);
365 snprintf(dimm
->label
, sizeof(dimm
->label
), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
366 imc
->src_id
, imc
->lmc
, chan
, dimmno
);
368 return (size
== 0 || size
== ~0ull) ? 0 : 1;
371 int skx_register_mci(struct skx_imc
*imc
, struct pci_dev
*pdev
,
372 const char *ctl_name
, const char *mod_str
,
373 get_dimm_config_f get_dimm_config
)
375 struct mem_ctl_info
*mci
;
376 struct edac_mc_layer layers
[2];
380 /* Allocate a new MC control structure */
381 layers
[0].type
= EDAC_MC_LAYER_CHANNEL
;
382 layers
[0].size
= NUM_CHANNELS
;
383 layers
[0].is_virt_csrow
= false;
384 layers
[1].type
= EDAC_MC_LAYER_SLOT
;
385 layers
[1].size
= NUM_DIMMS
;
386 layers
[1].is_virt_csrow
= true;
387 mci
= edac_mc_alloc(imc
->mc
, ARRAY_SIZE(layers
), layers
,
388 sizeof(struct skx_pvt
));
393 edac_dbg(0, "MC#%d: mci = %p\n", imc
->mc
, mci
);
395 /* Associate skx_dev and mci for future usage */
400 mci
->ctl_name
= kasprintf(GFP_KERNEL
, "%s#%d IMC#%d", ctl_name
,
401 imc
->node_id
, imc
->lmc
);
402 if (!mci
->ctl_name
) {
407 mci
->mtype_cap
= MEM_FLAG_DDR4
| MEM_FLAG_NVDIMM
;
408 mci
->edac_ctl_cap
= EDAC_FLAG_NONE
;
409 mci
->edac_cap
= EDAC_FLAG_NONE
;
410 mci
->mod_name
= mod_str
;
411 mci
->dev_name
= pci_name(pdev
);
412 mci
->ctl_page_to_phys
= NULL
;
414 rc
= get_dimm_config(mci
);
418 /* Record ptr to the generic device */
419 mci
->pdev
= &pdev
->dev
;
421 /* Add this new MC control structure to EDAC's list of MCs */
422 if (unlikely(edac_mc_add_mc(mci
))) {
423 edac_dbg(0, "MC: failed edac_mc_add_mc()\n");
431 kfree(mci
->ctl_name
);
438 static void skx_unregister_mci(struct skx_imc
*imc
)
440 struct mem_ctl_info
*mci
= imc
->mci
;
445 edac_dbg(0, "MC%d: mci = %p\n", imc
->mc
, mci
);
447 /* Remove MC sysfs nodes */
448 edac_mc_del_mc(mci
->pdev
);
450 edac_dbg(1, "%s: free mci struct\n", mci
->ctl_name
);
451 kfree(mci
->ctl_name
);
455 static struct mem_ctl_info
*get_mci(int src_id
, int lmc
)
459 if (lmc
> NUM_IMC
- 1) {
460 skx_printk(KERN_ERR
, "Bad lmc %d\n", lmc
);
464 list_for_each_entry(d
, &dev_edac_list
, list
) {
465 if (d
->imc
[0].src_id
== src_id
)
466 return d
->imc
[lmc
].mci
;
469 skx_printk(KERN_ERR
, "No mci for src_id %d lmc %d\n", src_id
, lmc
);
473 static void skx_mce_output_error(struct mem_ctl_info
*mci
,
475 struct decoded_addr
*res
)
477 enum hw_event_mc_err_type tp_event
;
479 bool ripv
= GET_BITFIELD(m
->mcgstatus
, 0, 0);
480 bool overflow
= GET_BITFIELD(m
->status
, 62, 62);
481 bool uncorrected_error
= GET_BITFIELD(m
->status
, 61, 61);
483 u32 core_err_cnt
= GET_BITFIELD(m
->status
, 38, 52);
484 u32 mscod
= GET_BITFIELD(m
->status
, 16, 31);
485 u32 errcode
= GET_BITFIELD(m
->status
, 0, 15);
486 u32 optypenum
= GET_BITFIELD(m
->status
, 4, 6);
488 recoverable
= GET_BITFIELD(m
->status
, 56, 56);
490 if (uncorrected_error
) {
494 tp_event
= HW_EVENT_ERR_FATAL
;
497 tp_event
= HW_EVENT_ERR_UNCORRECTED
;
501 tp_event
= HW_EVENT_ERR_CORRECTED
;
505 * According to Intel Architecture spec vol 3B,
506 * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding"
507 * memory errors should fit one of these masks:
508 * 000f 0000 1mmm cccc (binary)
509 * 000f 0010 1mmm cccc (binary) [RAM used as cache]
511 * f = Correction Report Filtering Bit. If 1, subsequent errors
515 * If the mask doesn't match, report an error to the parsing logic
517 if (!((errcode
& 0xef80) == 0x80 || (errcode
& 0xef80) == 0x280)) {
518 optype
= "Can't parse: it is not a mem";
522 optype
= "generic undef request error";
525 optype
= "memory read error";
528 optype
= "memory write error";
531 optype
= "addr/cmd error";
534 optype
= "memory scrubbing error";
541 if (adxl_component_count
) {
542 snprintf(skx_msg
, MSG_SIZE
, "%s%s err_code:0x%04x:0x%04x %s",
543 overflow
? " OVERFLOW" : "",
544 (uncorrected_error
&& recoverable
) ? " recoverable" : "",
545 mscod
, errcode
, adxl_msg
);
547 snprintf(skx_msg
, MSG_SIZE
,
548 "%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x",
549 overflow
? " OVERFLOW" : "",
550 (uncorrected_error
&& recoverable
) ? " recoverable" : "",
552 res
->socket
, res
->imc
, res
->rank
,
553 res
->bank_group
, res
->bank_address
, res
->row
, res
->column
);
556 edac_dbg(0, "%s\n", skx_msg
);
558 /* Call the helper to output message */
559 edac_mc_handle_error(tp_event
, mci
, core_err_cnt
,
560 m
->addr
>> PAGE_SHIFT
, m
->addr
& ~PAGE_MASK
, 0,
561 res
->channel
, res
->dimm
, -1,
565 int skx_mce_check_error(struct notifier_block
*nb
, unsigned long val
,
568 struct mce
*mce
= (struct mce
*)data
;
569 struct decoded_addr res
;
570 struct mem_ctl_info
*mci
;
573 if (edac_get_report_status() == EDAC_REPORTING_DISABLED
)
576 /* ignore unless this is memory related with an address */
577 if ((mce
->status
& 0xefff) >> 7 != 1 || !(mce
->status
& MCI_STATUS_ADDRV
))
580 memset(&res
, 0, sizeof(res
));
581 res
.addr
= mce
->addr
;
583 if (adxl_component_count
) {
584 if (!skx_adxl_decode(&res
))
587 mci
= get_mci(res
.socket
, res
.imc
);
589 if (!skx_decode
|| !skx_decode(&res
))
592 mci
= res
.dev
->imc
[res
.imc
].mci
;
598 if (mce
->mcgstatus
& MCG_STATUS_MCIP
)
603 skx_mc_printk(mci
, KERN_DEBUG
, "HANDLING MCE MEMORY ERROR\n");
605 skx_mc_printk(mci
, KERN_DEBUG
, "CPU %d: Machine Check %s: 0x%llx "
606 "Bank %d: 0x%llx\n", mce
->extcpu
, type
,
607 mce
->mcgstatus
, mce
->bank
, mce
->status
);
608 skx_mc_printk(mci
, KERN_DEBUG
, "TSC 0x%llx ", mce
->tsc
);
609 skx_mc_printk(mci
, KERN_DEBUG
, "ADDR 0x%llx ", mce
->addr
);
610 skx_mc_printk(mci
, KERN_DEBUG
, "MISC 0x%llx ", mce
->misc
);
612 skx_mc_printk(mci
, KERN_DEBUG
, "PROCESSOR %u:0x%x TIME %llu SOCKET "
613 "%u APIC 0x%x\n", mce
->cpuvendor
, mce
->cpuid
,
614 mce
->time
, mce
->socketid
, mce
->apicid
);
616 skx_mce_output_error(mci
, mce
, &res
);
621 void skx_remove(void)
624 struct skx_dev
*d
, *tmp
;
628 list_for_each_entry_safe(d
, tmp
, &dev_edac_list
, list
) {
630 for (i
= 0; i
< NUM_IMC
; i
++) {
632 skx_unregister_mci(&d
->imc
[i
]);
635 pci_dev_put(d
->imc
[i
].mdev
);
638 iounmap(d
->imc
[i
].mbase
);
640 for (j
= 0; j
< NUM_CHANNELS
; j
++) {
641 if (d
->imc
[i
].chan
[j
].cdev
)
642 pci_dev_put(d
->imc
[i
].chan
[j
].cdev
);
646 pci_dev_put(d
->util_all
);
648 pci_dev_put(d
->sad_all
);
650 pci_dev_put(d
->uracu
);