// SPDX-License-Identifier: GPL-2.0-only
/*
 * GHES/EDAC Linux driver
 *
 * Copyright (c) 2013 by Mauro Carvalho Chehab
 *
 * Red Hat Inc. https://www.redhat.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <acpi/ghes.h>
#include <linux/edac.h>
#include <linux/dmi.h>
#include "edac_module.h"
#include <ras/ras_event.h>
#include <linux/notifier.h>
#define OTHER_DETAIL_LEN 400

struct ghes_pvt {
	struct mem_ctl_info *mci;

	/* Buffers for the error handling routine */
	char other_detail[OTHER_DETAIL_LEN];
	char msg[80];		/* error type string, used as e->msg */
};
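
/*
 * Number of GHES instances currently registered with this driver. All
 * instances share the single EDAC memory controller below; it is only
 * torn down when the last instance unregisters.
 */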
static refcount_t ghes_refcount = REFCOUNT_INIT(0);

/*
 * Access to ghes_pvt must be protected by ghes_lock. The spinlock
 * also provides the necessary (implicit) memory barrier for the SMP
 * case to make the pointer visible on another CPU.
 */
static struct ghes_pvt *ghes_pvt;

/*
 * This driver's representation of the system hardware, as collected
 * from DMI.
 */
static struct ghes_hw_desc {
	int num_dimms;
	struct dimm_info *dimms;
} ghes_hw;

/* GHES registration mutex */
static DEFINE_MUTEX(ghes_reg_mutex);

/*
 * Sync with other, potentially concurrent callers of
 * ghes_edac_report_mem_error(). We don't know what the
 * "inventive" firmware would do.
 */
static DEFINE_SPINLOCK(ghes_lock);

static bool system_scanned;

static struct list_head *ghes_devs;
/* Memory Device - Type 17 of SMBIOS spec */
struct memdev_dmi_entry {
	u8 type;
	u8 length;
	u16 handle;
	u16 phys_mem_array_handle;
	u16 mem_err_info_handle;
	u16 total_width;
	u16 data_width;
	u16 size;
	u8 form_factor;
	u8 device_set;
	u8 device_locator;
	u8 bank_locator;
	u8 memory_type;
	u16 type_detail;
	u16 speed;
	u8 manufacturer;
	u8 serial_number;
	u8 asset_tag;
	u8 part_number;
	u8 attributes;
	u32 extended_size;
	u16 conf_mem_clk_speed;
} __attribute__((__packed__));
static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle)
{
	struct dimm_info *dimm;

	mci_for_each_dimm(mci, dimm) {
		if (dimm->smbios_handle == handle)
			return dimm;
	}

	return NULL;
}
static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
{
	const char *bank = NULL, *device = NULL;

	dmi_memdev_name(handle, &bank, &device);

	/*
	 * Set the label to an empty string when both bank and device are
	 * empty. In this case, the label assigned by default will be preserved.
	 */
	snprintf(dimm->label, sizeof(dimm->label), "%s%s%s",
		 (bank && *bank) ? bank : "",
		 (bank && *bank && device && *device) ? " " : "",
		 (device && *device) ? device : "");
}
static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry)
{
	u16 rdr_mask = BIT(7) | BIT(13);

	/*
	 * SMBIOS Type 17 size encoding: 0xffff means the size is unknown,
	 * 0x7fff means the real size is in extended_size, otherwise bit 15
	 * selects the size granularity.
	 */
	if (entry->size == 0xffff) {
		pr_info("Can't get DIMM%i size\n", dimm->idx);
		dimm->nr_pages = MiB_TO_PAGES(32);	/* Unknown */
	} else if (entry->size == 0x7fff) {
		dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
	} else {
		if (entry->size & BIT(15))
			dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10);
		else
			dimm->nr_pages = MiB_TO_PAGES(entry->size);
	}
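
	/*
	 * Map the SMBIOS memory type and type-detail bits to an EDAC mem_type.
	 * Per the SMBIOS Type 17 definition, type_detail BIT(13) marks a
	 * registered (buffered) module, BIT(12) a non-volatile one, BIT(7)
	 * synchronous, BIT(6) RAMBUS and BIT(9) EDO.
	 */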
	switch (entry->memory_type) {
	case 0x12:	/* DDR */
		if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR;
		else
			dimm->mtype = MEM_DDR;
		break;
	case 0x13:	/* DDR2 */
		if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR2;
		else
			dimm->mtype = MEM_DDR2;
		break;
	case 0x14:	/* DDR2 FB-DIMM */
		dimm->mtype = MEM_FB_DDR2;
		break;
	case 0x18:	/* DDR3 */
		if (entry->type_detail & BIT(12))
			dimm->mtype = MEM_NVDIMM;
		else if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR3;
		else
			dimm->mtype = MEM_DDR3;
		break;
	case 0x1a:	/* DDR4 */
		if (entry->type_detail & BIT(12))
			dimm->mtype = MEM_NVDIMM;
		else if (entry->type_detail & BIT(13))
			dimm->mtype = MEM_RDDR4;
		else
			dimm->mtype = MEM_DDR4;
		break;
	default:
		if (entry->type_detail & BIT(6))
			dimm->mtype = MEM_RMBS;
		else if ((entry->type_detail & rdr_mask) == rdr_mask)
			dimm->mtype = MEM_RDR;
		else if (entry->type_detail & BIT(7))
			dimm->mtype = MEM_SDR;
		else if (entry->type_detail & BIT(9))
			dimm->mtype = MEM_EDO;
		else
			dimm->mtype = MEM_UNKNOWN;
	}
	/*
	 * Actually, we can only detect if the memory has bits for
	 * checksum or not.
	 */
	if (entry->total_width == entry->data_width)
		dimm->edac_mode = EDAC_NONE;
	else
		dimm->edac_mode = EDAC_SECDED;

	dimm->dtype = DEV_UNKNOWN;
	dimm->grain = 128;		/* Likely, worst case */

	dimm_setup_label(dimm, entry->handle);

	if (dimm->nr_pages) {
		edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
			 dimm->idx, edac_mem_types[dimm->mtype],
			 PAGES_TO_MiB(dimm->nr_pages),
			 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
		edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
			 entry->memory_type, entry->type_detail,
			 entry->total_width, entry->data_width);
	}

	dimm->smbios_handle = entry->handle;
}
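
/*
 * dmi_walk() callback: grows the ghes_hw.dimms[] array as needed and fills
 * one dimm_info per SMBIOS Type 17 (Memory Device) record.
 */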
static void enumerate_dimms(const struct dmi_header *dh, void *arg)
{
	struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
	struct ghes_hw_desc *hw = (struct ghes_hw_desc *)arg;
	struct dimm_info *d;

	if (dh->type != DMI_ENTRY_MEM_DEVICE)
		return;

	/* Enlarge the array with additional 16 */
	if (!hw->num_dimms || !(hw->num_dimms % 16)) {
		struct dimm_info *new;

		new = krealloc_array(hw->dimms, hw->num_dimms + 16,
				     sizeof(struct dimm_info), GFP_KERNEL);
		if (!new) {
			WARN_ON_ONCE(1);
			return;
		}

		hw->dimms = new;
	}

	d = &hw->dimms[hw->num_dimms];
	d->idx = hw->num_dimms;

	assign_dmi_dimm_info(d, entry);

	hw->num_dimms++;
}
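
/* Walk the DMI/SMBIOS tables once and cache the DIMM inventory in ghes_hw. */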
static void ghes_scan_system(void)
{
	if (system_scanned)
		return;

	dmi_walk(enumerate_dimms, &ghes_hw);

	system_scanned = true;
}
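
/*
 * Format the remaining CPER fields (the APEI location string and, when
 * valid, the decoded error status) into the other_detail buffer. Returns
 * the number of characters written.
 */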
static int print_mem_error_other_detail(const struct cper_sec_mem_err *mem, char *msg,
					const char *location, unsigned int len)
{
	u32 n;

	if (!msg)
		return 0;

	n = 0;
	len -= 1;

	n += scnprintf(msg + n, len - n, "APEI location: %s ", location);

	if (!(mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS))
		goto out;

	n += scnprintf(msg + n, len - n, "status(0x%016llx): ", mem->error_status);
	n += scnprintf(msg + n, len - n, "%s ", cper_mem_err_status_str(mem->error_status));

out:
	msg[n] = '\0';

	return n;
}
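
/*
 * GHES report-chain notifier: translates a CPER memory error record into an
 * EDAC error report on the single ghes_edac memory controller.
 */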
static int ghes_edac_report_mem_error(struct notifier_block *nb,
				      unsigned long val, void *data)
{
	struct cper_sec_mem_err *mem_err = (struct cper_sec_mem_err *)data;
	struct cper_mem_err_compact cmem;
	struct edac_raw_error_desc *e;
	struct mem_ctl_info *mci;
	unsigned long sev = val;
	struct ghes_pvt *pvt;
	unsigned long flags;
	char *p;

	/*
	 * We can do the locking below because GHES defers error processing
	 * from NMI to IRQ context. Whenever that changes, we'd at least
	 * know.
	 */
	if (WARN_ON_ONCE(in_nmi()))
		return NOTIFY_OK;

	spin_lock_irqsave(&ghes_lock, flags);

	pvt = ghes_pvt;
	if (!pvt)
		goto unlock;

	mci = pvt->mci;
	e = &mci->error_desc;

	/* Clean the error report buffer */
	memset(e, 0, sizeof(*e));
	e->error_count = 1;
	e->grain = 1;
	e->msg = pvt->msg;
	e->other_detail = pvt->other_detail;
	e->top_layer = -1;
	e->mid_layer = -1;
	e->low_layer = -1;
	*pvt->msg = '\0';
	*pvt->other_detail = '\0';
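
	/* Map the GHES severity reported by the firmware to an EDAC event type. */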
	switch (sev) {
	case GHES_SEV_CORRECTED:
		e->type = HW_EVENT_ERR_CORRECTED;
		break;
	case GHES_SEV_RECOVERABLE:
		e->type = HW_EVENT_ERR_UNCORRECTED;
		break;
	case GHES_SEV_PANIC:
		e->type = HW_EVENT_ERR_FATAL;
		break;
	default:
	case GHES_SEV_NO:
		e->type = HW_EVENT_ERR_INFO;
	}

	edac_dbg(1, "error validation_bits: 0x%08llx\n",
		 (long long)mem_err->validation_bits);
	/* Error type, mapped on e->msg */
	if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
		u8 etype = mem_err->error_type;

		p = pvt->msg;
		p += snprintf(p, sizeof(pvt->msg), "%s", cper_mem_err_type_str(etype));
	} else {
		strcpy(pvt->msg, "unknown error");
	}

	/* Error address */
	if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
		e->page_frame_number = PHYS_PFN(mem_err->physical_addr);
		e->offset_in_page = offset_in_page(mem_err->physical_addr);
	}
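
	/*
	 * The physical address mask encodes the granularity of the reported
	 * address: the grain is the size of the naturally aligned region
	 * described by the clear low bits, i.e. ~mask + 1.
	 */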
	if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
		e->grain = ~mem_err->physical_addr_mask + 1;

	/* Memory error location, mapped on e->location */
	p = e->location;
	cper_mem_err_pack(mem_err, &cmem);
	p += cper_mem_err_location(&cmem, p);

	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
		struct dimm_info *dimm;

		p += cper_dimm_err_location(&cmem, p);
		dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle);
		if (dimm) {
			e->top_layer = dimm->idx;
			strcpy(e->label, dimm->label);
		}
	}
	if (p > e->location)
		*(p - 1) = '\0';

	if (!*e->label)
		strcpy(e->label, "unknown memory");
	/* All other fields are mapped on e->other_detail */
	p = pvt->other_detail;
	p += print_mem_error_other_detail(mem_err, p, e->location, OTHER_DETAIL_LEN);
	if (p > pvt->other_detail)
		*(p - 1) = '\0';

	edac_raw_mc_handle_error(e);

unlock:
	spin_unlock_irqrestore(&ghes_lock, flags);

	return NOTIFY_OK;
}
static struct notifier_block ghes_edac_mem_err_nb = {
	.notifier_call	= ghes_edac_report_mem_error,
};
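
/*
 * Set up the single EDAC memory controller shared by all GHES instances:
 * scan DMI for DIMMs, allocate and populate the MC, and hook up the error
 * notifier. Subsequent callers only bump ghes_refcount.
 */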
static int ghes_edac_register(struct device *dev)
{
	bool fake = false;
	struct mem_ctl_info *mci;
	struct ghes_pvt *pvt;
	struct edac_mc_layer layers[1];
	unsigned long flags;
	int rc = 0;

	/* finish another registration/unregistration instance first */
	mutex_lock(&ghes_reg_mutex);

	/*
	 * We have only one logical memory controller to which all DIMMs belong.
	 */
	if (refcount_inc_not_zero(&ghes_refcount))
		goto unlock;

	ghes_scan_system();

	/* Check if we've got a bogus BIOS */
	if (!ghes_hw.num_dimms) {
		fake = true;
		ghes_hw.num_dimms = 1;
	}

	layers[0].type = EDAC_MC_LAYER_ALL_MEM;
	layers[0].size = ghes_hw.num_dimms;
	layers[0].is_virt_csrow = true;

	mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_pvt));
	if (!mci) {
		pr_info("Can't allocate memory for EDAC data\n");
		rc = -ENOMEM;
		goto unlock;
	}

	pvt = mci->pvt_info;
	pvt->mci = mci;

	mci->pdev = dev;
	mci->mtype_cap = MEM_FLAG_EMPTY;
	mci->edac_ctl_cap = EDAC_FLAG_NONE;
	mci->edac_cap = EDAC_FLAG_NONE;
	mci->mod_name = "ghes_edac.c";
	mci->ctl_name = "ghes_edac";
	mci->dev_name = "ghes";

	if (fake) {
		pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMs.\n");
		pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
		pr_info("work on such a system. Use this driver with caution.\n");
	}

	pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms);
	if (!fake) {
		struct dimm_info *src, *dst;
		int i = 0;

		mci_for_each_dimm(mci, dst) {
			src = &ghes_hw.dimms[i];

			dst->idx = src->idx;
			dst->smbios_handle = src->smbios_handle;
			dst->nr_pages = src->nr_pages;
			dst->mtype = src->mtype;
			dst->edac_mode = src->edac_mode;
			dst->dtype = src->dtype;
			dst->grain = src->grain;

			/*
			 * If no src->label, preserve default label assigned
			 * from EDAC
			 */
			if (strlen(src->label))
				memcpy(dst->label, src->label, sizeof(src->label));

			i++;
		}
	} else {
		struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0);

		dimm->nr_pages = 1;
		dimm->grain = 128;
		dimm->mtype = MEM_UNKNOWN;
		dimm->dtype = DEV_UNKNOWN;
		dimm->edac_mode = EDAC_SECDED;
	}

	rc = edac_mc_add_mc(mci);
	if (rc < 0) {
		pr_info("Can't register with the EDAC core\n");
		edac_mc_free(mci);
		rc = -ENODEV;
		goto unlock;
	}

	spin_lock_irqsave(&ghes_lock, flags);
	ghes_pvt = pvt;
	spin_unlock_irqrestore(&ghes_lock, flags);

	ghes_register_report_chain(&ghes_edac_mem_err_nb);

	/* only set on success */
	refcount_set(&ghes_refcount, 1);

unlock:

	/* Not needed anymore */
	kfree(ghes_hw.dimms);
	ghes_hw.dimms = NULL;

	mutex_unlock(&ghes_reg_mutex);

	return rc;
}
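
/*
 * Tear down the shared memory controller when the last GHES instance goes
 * away: clear ghes_pvt under ghes_lock so the error notifier cannot see a
 * stale pointer, then remove and free the MC.
 */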
static void ghes_edac_unregister(struct ghes *ghes)
{
	struct mem_ctl_info *mci;
	unsigned long flags;

	mutex_lock(&ghes_reg_mutex);

	system_scanned = false;
	memset(&ghes_hw, 0, sizeof(struct ghes_hw_desc));

	if (!refcount_dec_and_test(&ghes_refcount))
		goto unlock;

	/*
	 * Wait for the irq handler to finish.
	 */
	spin_lock_irqsave(&ghes_lock, flags);
	mci = ghes_pvt ? ghes_pvt->mci : NULL;
	ghes_pvt = NULL;
	spin_unlock_irqrestore(&ghes_lock, flags);

	if (!mci)
		goto unlock;

	mci = edac_mc_del_mc(mci->pdev);
	if (mci)
		edac_mc_free(mci);

	ghes_unregister_report_chain(&ghes_edac_mem_err_nb);

unlock:
	mutex_unlock(&ghes_reg_mutex);
}
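
/*
 * On module load, register against every GHES device provided by the GHES
 * core (the first call creates the shared MC, later calls only take a
 * reference); on unload, drop those references and free the MC.
 */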
static int __init ghes_edac_init(void)
{
	struct ghes *g, *g_tmp;

	ghes_devs = ghes_get_devices();
	if (!ghes_devs)
		return -ENODEV;

	if (list_empty(ghes_devs)) {
		pr_info("GHES probing device list is empty\n");
		return -ENODEV;
	}

	list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
		ghes_edac_register(g->dev);
	}

	return 0;
}
module_init(ghes_edac_init);
static void __exit ghes_edac_exit(void)
{
	struct ghes *g, *g_tmp;

	list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) {
		ghes_edac_unregister(g);
	}
}
module_exit(ghes_edac_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Output ACPI APEI/GHES BIOS detected errors via EDAC");