// SPDX-License-Identifier: GPL-2.0
/*
 * Shared code by both skx_edac and i10nm_edac. Originally split out
 * from the skx_edac driver.
 *
 * This file is linked into both skx_edac and i10nm_edac drivers. In
 * order to avoid link errors, this file must be like a pure library
 * without including symbols and defines which would otherwise conflict,
 * when linked once into a module and into a built-in object, at the
 * same time. For example, __this_module symbol references when that
 * file is being linked into a built-in object.
 *
 * Copyright (c) 2018, Intel Corporation.
 */
#include <linux/acpi.h>
#include <linux/dmi.h>
#include <linux/adxl.h>
#include <acpi/nfit.h>
#include "edac_module.h"
#include "skx_common.h"
25 static const char * const component_names
[] = {
26 [INDEX_SOCKET
] = "ProcessorSocketId",
27 [INDEX_MEMCTRL
] = "MemoryControllerId",
28 [INDEX_CHANNEL
] = "ChannelId",
29 [INDEX_DIMM
] = "DimmSlotId",
32 static int component_indices
[ARRAY_SIZE(component_names
)];
33 static int adxl_component_count
;
34 static const char * const *adxl_component_names
;
35 static u64
*adxl_values
;
36 static char *adxl_msg
;
38 static char skx_msg
[MSG_SIZE
];
39 static skx_decode_f skx_decode
;
40 static skx_show_retry_log_f skx_show_retry_rd_err_log
;
41 static u64 skx_tolm
, skx_tohm
;
42 static LIST_HEAD(dev_edac_list
);
44 int __init
skx_adxl_get(void)
46 const char * const *names
;
49 names
= adxl_get_component_names();
51 skx_printk(KERN_NOTICE
, "No firmware support for address translation.\n");
55 for (i
= 0; i
< INDEX_MAX
; i
++) {
56 for (j
= 0; names
[j
]; j
++) {
57 if (!strcmp(component_names
[i
], names
[j
])) {
58 component_indices
[i
] = j
;
67 adxl_component_names
= names
;
69 adxl_component_count
++;
71 adxl_values
= kcalloc(adxl_component_count
, sizeof(*adxl_values
),
74 adxl_component_count
= 0;
78 adxl_msg
= kzalloc(MSG_SIZE
, GFP_KERNEL
);
80 adxl_component_count
= 0;
87 skx_printk(KERN_ERR
, "'%s' is not matched from DSM parameters: ",
89 for (j
= 0; names
[j
]; j
++)
90 skx_printk(KERN_CONT
, "%s ", names
[j
]);
91 skx_printk(KERN_CONT
, "\n");
96 void __exit
skx_adxl_put(void)
102 static bool skx_adxl_decode(struct decoded_addr
*res
)
107 if (res
->addr
>= skx_tohm
|| (res
->addr
>= skx_tolm
&&
108 res
->addr
< BIT_ULL(32))) {
109 edac_dbg(0, "Address 0x%llx out of range\n", res
->addr
);
113 if (adxl_decode(res
->addr
, adxl_values
)) {
114 edac_dbg(0, "Failed to decode 0x%llx\n", res
->addr
);
118 res
->socket
= (int)adxl_values
[component_indices
[INDEX_SOCKET
]];
119 res
->imc
= (int)adxl_values
[component_indices
[INDEX_MEMCTRL
]];
120 res
->channel
= (int)adxl_values
[component_indices
[INDEX_CHANNEL
]];
121 res
->dimm
= (int)adxl_values
[component_indices
[INDEX_DIMM
]];
123 if (res
->imc
> NUM_IMC
- 1) {
124 skx_printk(KERN_ERR
, "Bad imc %d\n", res
->imc
);
128 list_for_each_entry(d
, &dev_edac_list
, list
) {
129 if (d
->imc
[0].src_id
== res
->socket
) {
136 skx_printk(KERN_ERR
, "No device for src_id %d imc %d\n",
137 res
->socket
, res
->imc
);
141 for (i
= 0; i
< adxl_component_count
; i
++) {
142 if (adxl_values
[i
] == ~0x0ull
)
145 len
+= snprintf(adxl_msg
+ len
, MSG_SIZE
- len
, " %s:0x%llx",
146 adxl_component_names
[i
], adxl_values
[i
]);
147 if (MSG_SIZE
- len
<= 0)
154 void skx_set_decode(skx_decode_f decode
, skx_show_retry_log_f show_retry_log
)
157 skx_show_retry_rd_err_log
= show_retry_log
;
160 int skx_get_src_id(struct skx_dev
*d
, int off
, u8
*id
)
164 if (pci_read_config_dword(d
->util_all
, off
, ®
)) {
165 skx_printk(KERN_ERR
, "Failed to read src id\n");
169 *id
= GET_BITFIELD(reg
, 12, 14);
173 int skx_get_node_id(struct skx_dev
*d
, u8
*id
)
177 if (pci_read_config_dword(d
->util_all
, 0xf4, ®
)) {
178 skx_printk(KERN_ERR
, "Failed to read node id\n");
182 *id
= GET_BITFIELD(reg
, 0, 2);
186 static int get_width(u32 mtr
)
188 switch (GET_BITFIELD(mtr
, 8, 9)) {
200 * We use the per-socket device @did to count how many sockets are present,
201 * and to detemine which PCI buses are associated with each socket. Allocate
202 * and build the full list of all the skx_dev structures that we need here.
204 int skx_get_all_bus_mappings(unsigned int did
, int off
, enum type type
,
205 struct list_head
**list
)
207 struct pci_dev
*pdev
, *prev
;
214 pdev
= pci_get_device(PCI_VENDOR_ID_INTEL
, did
, prev
);
218 d
= kzalloc(sizeof(*d
), GFP_KERNEL
);
224 if (pci_read_config_dword(pdev
, off
, ®
)) {
227 skx_printk(KERN_ERR
, "Failed to read bus idx\n");
231 d
->bus
[0] = GET_BITFIELD(reg
, 0, 7);
232 d
->bus
[1] = GET_BITFIELD(reg
, 8, 15);
234 d
->seg
= pci_domain_nr(pdev
->bus
);
235 d
->bus
[2] = GET_BITFIELD(reg
, 16, 23);
236 d
->bus
[3] = GET_BITFIELD(reg
, 24, 31);
238 d
->seg
= GET_BITFIELD(reg
, 16, 23);
241 edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x\n",
242 d
->bus
[0], d
->bus
[1], d
->bus
[2], d
->bus
[3]);
243 list_add_tail(&d
->list
, &dev_edac_list
);
248 *list
= &dev_edac_list
;
252 int skx_get_hi_lo(unsigned int did
, int off
[], u64
*tolm
, u64
*tohm
)
254 struct pci_dev
*pdev
;
257 pdev
= pci_get_device(PCI_VENDOR_ID_INTEL
, did
, NULL
);
259 skx_printk(KERN_ERR
, "Can't get tolm/tohm\n");
263 if (pci_read_config_dword(pdev
, off
[0], ®
)) {
264 skx_printk(KERN_ERR
, "Failed to read tolm\n");
269 if (pci_read_config_dword(pdev
, off
[1], ®
)) {
270 skx_printk(KERN_ERR
, "Failed to read lower tohm\n");
275 if (pci_read_config_dword(pdev
, off
[2], ®
)) {
276 skx_printk(KERN_ERR
, "Failed to read upper tohm\n");
279 skx_tohm
|= (u64
)reg
<< 32;
284 edac_dbg(2, "tolm = 0x%llx tohm = 0x%llx\n", skx_tolm
, skx_tohm
);
291 static int skx_get_dimm_attr(u32 reg
, int lobit
, int hibit
, int add
,
292 int minval
, int maxval
, const char *name
)
294 u32 val
= GET_BITFIELD(reg
, lobit
, hibit
);
296 if (val
< minval
|| val
> maxval
) {
297 edac_dbg(2, "bad %s = %d (raw=0x%x)\n", name
, val
, reg
);
303 #define numrank(reg) skx_get_dimm_attr(reg, 12, 13, 0, 0, 2, "ranks")
304 #define numrow(reg) skx_get_dimm_attr(reg, 2, 4, 12, 1, 6, "rows")
305 #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols")
307 int skx_get_dimm_info(u32 mtr
, u32 amap
, struct dimm_info
*dimm
,
308 struct skx_imc
*imc
, int chan
, int dimmno
)
310 int banks
= 16, ranks
, rows
, cols
, npages
;
313 ranks
= numrank(mtr
);
318 * Compute size in 8-byte (2^3) words, then shift to MiB (2^20)
320 size
= ((1ull << (rows
+ cols
+ ranks
)) * banks
) >> (20 - 3);
321 npages
= MiB_TO_PAGES(size
);
323 edac_dbg(0, "mc#%d: channel %d, dimm %d, %lld MiB (%d pages) bank: %d, rank: %d, row: 0x%x, col: 0x%x\n",
324 imc
->mc
, chan
, dimmno
, size
, npages
,
325 banks
, 1 << ranks
, rows
, cols
);
327 imc
->chan
[chan
].dimms
[dimmno
].close_pg
= GET_BITFIELD(mtr
, 0, 0);
328 imc
->chan
[chan
].dimms
[dimmno
].bank_xor_enable
= GET_BITFIELD(mtr
, 9, 9);
329 imc
->chan
[chan
].dimms
[dimmno
].fine_grain_bank
= GET_BITFIELD(amap
, 0, 0);
330 imc
->chan
[chan
].dimms
[dimmno
].rowbits
= rows
;
331 imc
->chan
[chan
].dimms
[dimmno
].colbits
= cols
;
333 dimm
->nr_pages
= npages
;
335 dimm
->dtype
= get_width(mtr
);
336 dimm
->mtype
= MEM_DDR4
;
337 dimm
->edac_mode
= EDAC_SECDED
; /* likely better than this */
338 snprintf(dimm
->label
, sizeof(dimm
->label
), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
339 imc
->src_id
, imc
->lmc
, chan
, dimmno
);
344 int skx_get_nvdimm_info(struct dimm_info
*dimm
, struct skx_imc
*imc
,
345 int chan
, int dimmno
, const char *mod_str
)
352 dev_handle
= ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno
, chan
, imc
->lmc
,
355 smbios_handle
= nfit_get_smbios_id(dev_handle
, &flags
);
356 if (smbios_handle
== -EOPNOTSUPP
) {
357 pr_warn_once("%s: Can't find size of NVDIMM. Try enabling CONFIG_ACPI_NFIT\n", mod_str
);
361 if (smbios_handle
< 0) {
362 skx_printk(KERN_ERR
, "Can't find handle for NVDIMM ADR=0x%x\n", dev_handle
);
366 if (flags
& ACPI_NFIT_MEM_MAP_FAILED
) {
367 skx_printk(KERN_ERR
, "NVDIMM ADR=0x%x is not mapped\n", dev_handle
);
371 size
= dmi_memdev_size(smbios_handle
);
373 skx_printk(KERN_ERR
, "Can't find size for NVDIMM ADR=0x%x/SMBIOS=0x%x\n",
374 dev_handle
, smbios_handle
);
377 dimm
->nr_pages
= size
>> PAGE_SHIFT
;
379 dimm
->dtype
= DEV_UNKNOWN
;
380 dimm
->mtype
= MEM_NVDIMM
;
381 dimm
->edac_mode
= EDAC_SECDED
; /* likely better than this */
383 edac_dbg(0, "mc#%d: channel %d, dimm %d, %llu MiB (%u pages)\n",
384 imc
->mc
, chan
, dimmno
, size
>> 20, dimm
->nr_pages
);
386 snprintf(dimm
->label
, sizeof(dimm
->label
), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
387 imc
->src_id
, imc
->lmc
, chan
, dimmno
);
389 return (size
== 0 || size
== ~0ull) ? 0 : 1;
392 int skx_register_mci(struct skx_imc
*imc
, struct pci_dev
*pdev
,
393 const char *ctl_name
, const char *mod_str
,
394 get_dimm_config_f get_dimm_config
)
396 struct mem_ctl_info
*mci
;
397 struct edac_mc_layer layers
[2];
401 /* Allocate a new MC control structure */
402 layers
[0].type
= EDAC_MC_LAYER_CHANNEL
;
403 layers
[0].size
= NUM_CHANNELS
;
404 layers
[0].is_virt_csrow
= false;
405 layers
[1].type
= EDAC_MC_LAYER_SLOT
;
406 layers
[1].size
= NUM_DIMMS
;
407 layers
[1].is_virt_csrow
= true;
408 mci
= edac_mc_alloc(imc
->mc
, ARRAY_SIZE(layers
), layers
,
409 sizeof(struct skx_pvt
));
414 edac_dbg(0, "MC#%d: mci = %p\n", imc
->mc
, mci
);
416 /* Associate skx_dev and mci for future usage */
421 mci
->ctl_name
= kasprintf(GFP_KERNEL
, "%s#%d IMC#%d", ctl_name
,
422 imc
->node_id
, imc
->lmc
);
423 if (!mci
->ctl_name
) {
428 mci
->mtype_cap
= MEM_FLAG_DDR4
| MEM_FLAG_NVDIMM
;
429 mci
->edac_ctl_cap
= EDAC_FLAG_NONE
;
430 mci
->edac_cap
= EDAC_FLAG_NONE
;
431 mci
->mod_name
= mod_str
;
432 mci
->dev_name
= pci_name(pdev
);
433 mci
->ctl_page_to_phys
= NULL
;
435 rc
= get_dimm_config(mci
);
439 /* Record ptr to the generic device */
440 mci
->pdev
= &pdev
->dev
;
442 /* Add this new MC control structure to EDAC's list of MCs */
443 if (unlikely(edac_mc_add_mc(mci
))) {
444 edac_dbg(0, "MC: failed edac_mc_add_mc()\n");
452 kfree(mci
->ctl_name
);
459 static void skx_unregister_mci(struct skx_imc
*imc
)
461 struct mem_ctl_info
*mci
= imc
->mci
;
466 edac_dbg(0, "MC%d: mci = %p\n", imc
->mc
, mci
);
468 /* Remove MC sysfs nodes */
469 edac_mc_del_mc(mci
->pdev
);
471 edac_dbg(1, "%s: free mci struct\n", mci
->ctl_name
);
472 kfree(mci
->ctl_name
);
476 static void skx_mce_output_error(struct mem_ctl_info
*mci
,
478 struct decoded_addr
*res
)
480 enum hw_event_mc_err_type tp_event
;
482 bool ripv
= GET_BITFIELD(m
->mcgstatus
, 0, 0);
483 bool overflow
= GET_BITFIELD(m
->status
, 62, 62);
484 bool uncorrected_error
= GET_BITFIELD(m
->status
, 61, 61);
487 u32 core_err_cnt
= GET_BITFIELD(m
->status
, 38, 52);
488 u32 mscod
= GET_BITFIELD(m
->status
, 16, 31);
489 u32 errcode
= GET_BITFIELD(m
->status
, 0, 15);
490 u32 optypenum
= GET_BITFIELD(m
->status
, 4, 6);
492 recoverable
= GET_BITFIELD(m
->status
, 56, 56);
494 if (uncorrected_error
) {
497 tp_event
= HW_EVENT_ERR_FATAL
;
499 tp_event
= HW_EVENT_ERR_UNCORRECTED
;
502 tp_event
= HW_EVENT_ERR_CORRECTED
;
506 * According to Intel Architecture spec vol 3B,
507 * Table 15-10 "IA32_MCi_Status [15:0] Compound Error Code Encoding"
508 * memory errors should fit one of these masks:
509 * 000f 0000 1mmm cccc (binary)
510 * 000f 0010 1mmm cccc (binary) [RAM used as cache]
512 * f = Correction Report Filtering Bit. If 1, subsequent errors
516 * If the mask doesn't match, report an error to the parsing logic
518 if (!((errcode
& 0xef80) == 0x80 || (errcode
& 0xef80) == 0x280)) {
519 optype
= "Can't parse: it is not a mem";
523 optype
= "generic undef request error";
526 optype
= "memory read error";
529 optype
= "memory write error";
532 optype
= "addr/cmd error";
535 optype
= "memory scrubbing error";
542 if (adxl_component_count
) {
543 len
= snprintf(skx_msg
, MSG_SIZE
, "%s%s err_code:0x%04x:0x%04x %s",
544 overflow
? " OVERFLOW" : "",
545 (uncorrected_error
&& recoverable
) ? " recoverable" : "",
546 mscod
, errcode
, adxl_msg
);
548 len
= snprintf(skx_msg
, MSG_SIZE
,
549 "%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x",
550 overflow
? " OVERFLOW" : "",
551 (uncorrected_error
&& recoverable
) ? " recoverable" : "",
553 res
->socket
, res
->imc
, res
->rank
,
554 res
->bank_group
, res
->bank_address
, res
->row
, res
->column
);
557 if (skx_show_retry_rd_err_log
)
558 skx_show_retry_rd_err_log(res
, skx_msg
+ len
, MSG_SIZE
- len
);
560 edac_dbg(0, "%s\n", skx_msg
);
562 /* Call the helper to output message */
563 edac_mc_handle_error(tp_event
, mci
, core_err_cnt
,
564 m
->addr
>> PAGE_SHIFT
, m
->addr
& ~PAGE_MASK
, 0,
565 res
->channel
, res
->dimm
, -1,
569 int skx_mce_check_error(struct notifier_block
*nb
, unsigned long val
,
572 struct mce
*mce
= (struct mce
*)data
;
573 struct decoded_addr res
;
574 struct mem_ctl_info
*mci
;
577 if (edac_get_report_status() == EDAC_REPORTING_DISABLED
)
580 /* ignore unless this is memory related with an address */
581 if ((mce
->status
& 0xefff) >> 7 != 1 || !(mce
->status
& MCI_STATUS_ADDRV
))
584 memset(&res
, 0, sizeof(res
));
585 res
.addr
= mce
->addr
;
587 if (adxl_component_count
) {
588 if (!skx_adxl_decode(&res
))
590 } else if (!skx_decode
|| !skx_decode(&res
)) {
594 mci
= res
.dev
->imc
[res
.imc
].mci
;
599 if (mce
->mcgstatus
& MCG_STATUS_MCIP
)
604 skx_mc_printk(mci
, KERN_DEBUG
, "HANDLING MCE MEMORY ERROR\n");
606 skx_mc_printk(mci
, KERN_DEBUG
, "CPU %d: Machine Check %s: 0x%llx "
607 "Bank %d: 0x%llx\n", mce
->extcpu
, type
,
608 mce
->mcgstatus
, mce
->bank
, mce
->status
);
609 skx_mc_printk(mci
, KERN_DEBUG
, "TSC 0x%llx ", mce
->tsc
);
610 skx_mc_printk(mci
, KERN_DEBUG
, "ADDR 0x%llx ", mce
->addr
);
611 skx_mc_printk(mci
, KERN_DEBUG
, "MISC 0x%llx ", mce
->misc
);
613 skx_mc_printk(mci
, KERN_DEBUG
, "PROCESSOR %u:0x%x TIME %llu SOCKET "
614 "%u APIC 0x%x\n", mce
->cpuvendor
, mce
->cpuid
,
615 mce
->time
, mce
->socketid
, mce
->apicid
);
617 skx_mce_output_error(mci
, mce
, &res
);
622 void skx_remove(void)
625 struct skx_dev
*d
, *tmp
;
629 list_for_each_entry_safe(d
, tmp
, &dev_edac_list
, list
) {
631 for (i
= 0; i
< NUM_IMC
; i
++) {
633 skx_unregister_mci(&d
->imc
[i
]);
636 pci_dev_put(d
->imc
[i
].mdev
);
639 iounmap(d
->imc
[i
].mbase
);
641 for (j
= 0; j
< NUM_CHANNELS
; j
++) {
642 if (d
->imc
[i
].chan
[j
].cdev
)
643 pci_dev_put(d
->imc
[i
].chan
[j
].cdev
);
647 pci_dev_put(d
->util_all
);
649 pci_dev_put(d
->sad_all
);
651 pci_dev_put(d
->uracu
);