2 * drivers/pci/pcie/aer/aerdrv_core.c
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
8 * This file implements the core part of PCI-Express AER. When a PCI-Express
9 * error is delivered, an error message will be collected and printed to
10 * console, then, an error recovery procedure will be executed by following
11 * the pci error recovery rules.
13 * Copyright (C) 2006 Intel Corp.
14 * Tom Long Nguyen (tom.l.nguyen@intel.com)
15 * Zhang Yanmin (yanmin.zhang@intel.com)
19 #include <linux/module.h>
20 #include <linux/pci.h>
21 #include <linux/kernel.h>
22 #include <linux/errno.h>
24 #include <linux/suspend.h>
25 #include <linux/delay.h>
26 #include <linux/slab.h>
30 static int nosourceid
;
31 module_param(forceload
, bool, 0);
32 module_param(nosourceid
, bool, 0);
34 int pci_enable_pcie_error_reporting(struct pci_dev
*dev
)
39 if (pcie_aer_get_firmware_first(dev
))
42 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
46 pos
= pci_pcie_cap(dev
);
50 pci_read_config_word(dev
, pos
+ PCI_EXP_DEVCTL
, ®16
);
51 reg16
|= (PCI_EXP_DEVCTL_CERE
|
52 PCI_EXP_DEVCTL_NFERE
|
55 pci_write_config_word(dev
, pos
+ PCI_EXP_DEVCTL
, reg16
);
59 EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting
);
61 int pci_disable_pcie_error_reporting(struct pci_dev
*dev
)
66 if (pcie_aer_get_firmware_first(dev
))
69 pos
= pci_pcie_cap(dev
);
73 pci_read_config_word(dev
, pos
+ PCI_EXP_DEVCTL
, ®16
);
74 reg16
&= ~(PCI_EXP_DEVCTL_CERE
|
75 PCI_EXP_DEVCTL_NFERE
|
78 pci_write_config_word(dev
, pos
+ PCI_EXP_DEVCTL
, reg16
);
82 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting
);
84 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev
*dev
)
89 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
93 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
, &status
);
95 pci_write_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
, status
);
99 EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status
);
102 * add_error_device - list device to be handled
103 * @e_info: pointer to error info
104 * @dev: pointer to pci_dev to be added
106 static int add_error_device(struct aer_err_info
*e_info
, struct pci_dev
*dev
)
108 if (e_info
->error_dev_num
< AER_MAX_MULTI_ERR_DEVICES
) {
109 e_info
->dev
[e_info
->error_dev_num
] = dev
;
110 e_info
->error_dev_num
++;
116 #define PCI_BUS(x) (((x) >> 8) & 0xff)
119 * is_error_source - check whether the device is source of reported error
120 * @dev: pointer to pci_dev to be checked
121 * @e_info: pointer to reported error info
123 static bool is_error_source(struct pci_dev
*dev
, struct aer_err_info
*e_info
)
130 * When bus id is equal to 0, it might be a bad id
131 * reported by root port.
133 if (!nosourceid
&& (PCI_BUS(e_info
->id
) != 0)) {
134 /* Device ID match? */
135 if (e_info
->id
== ((dev
->bus
->number
<< 8) | dev
->devfn
))
138 /* Continue id comparing if there is no multiple error */
139 if (!e_info
->multi_error_valid
)
146 * 2) bus id is equal to 0. Some ports might lose the bus
147 * id of error source id;
148 * 3) There are multiple errors and prior id comparing fails;
149 * We check AER status registers to find possible reporter.
151 if (atomic_read(&dev
->enable_cnt
) == 0)
153 pos
= pci_pcie_cap(dev
);
157 /* Check if AER is enabled */
158 pci_read_config_word(dev
, pos
+ PCI_EXP_DEVCTL
, ®16
);
160 PCI_EXP_DEVCTL_CERE
|
161 PCI_EXP_DEVCTL_NFERE
|
162 PCI_EXP_DEVCTL_FERE
|
163 PCI_EXP_DEVCTL_URRE
)))
165 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
169 /* Check if error is recorded */
170 if (e_info
->severity
== AER_CORRECTABLE
) {
171 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_STATUS
, &status
);
172 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_MASK
, &mask
);
174 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
, &status
);
175 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_MASK
, &mask
);
183 static int find_device_iter(struct pci_dev
*dev
, void *data
)
185 struct aer_err_info
*e_info
= (struct aer_err_info
*)data
;
187 if (is_error_source(dev
, e_info
)) {
188 /* List this device */
189 if (add_error_device(e_info
, dev
)) {
190 /* We cannot handle more... Stop iteration */
191 /* TODO: Should print error message here? */
195 /* If there is only a single error, stop iteration */
196 if (!e_info
->multi_error_valid
)
203 * find_source_device - search through device hierarchy for source device
204 * @parent: pointer to Root Port pci_dev data structure
205 * @e_info: including detailed error information such as id
207 * Return true if found.
209 * Invoked by DPC when error is detected at the Root Port.
210 * Caller of this function must set id, severity, and multi_error_valid of
211 * struct aer_err_info pointed by @e_info properly. This function must fill
212 * e_info->error_dev_num and e_info->dev[], based on the given information.
214 static bool find_source_device(struct pci_dev
*parent
,
215 struct aer_err_info
*e_info
)
217 struct pci_dev
*dev
= parent
;
220 /* Must reset in this function */
221 e_info
->error_dev_num
= 0;
223 /* Is Root Port an agent that sends error message? */
224 result
= find_device_iter(dev
, e_info
);
228 pci_walk_bus(parent
->subordinate
, find_device_iter
, e_info
);
230 if (!e_info
->error_dev_num
) {
231 dev_printk(KERN_DEBUG
, &parent
->dev
,
232 "can't find device of ID%04x\n",
239 static int report_error_detected(struct pci_dev
*dev
, void *data
)
241 pci_ers_result_t vote
;
242 struct pci_error_handlers
*err_handler
;
243 struct aer_broadcast_data
*result_data
;
244 result_data
= (struct aer_broadcast_data
*) data
;
246 dev
->error_state
= result_data
->state
;
249 !dev
->driver
->err_handler
||
250 !dev
->driver
->err_handler
->error_detected
) {
251 if (result_data
->state
== pci_channel_io_frozen
&&
252 !(dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
)) {
254 * In case of fatal recovery, if one of the down-
255 * stream devices has no driver, we might be
256 * unable to recover because a later insmod
257 * of a driver for this device is unaware of
260 dev_printk(KERN_DEBUG
, &dev
->dev
, "device has %s\n",
262 "no AER-aware driver" : "no driver");
267 err_handler
= dev
->driver
->err_handler
;
268 vote
= err_handler
->error_detected(dev
, result_data
->state
);
269 result_data
->result
= merge_result(result_data
->result
, vote
);
273 static int report_mmio_enabled(struct pci_dev
*dev
, void *data
)
275 pci_ers_result_t vote
;
276 struct pci_error_handlers
*err_handler
;
277 struct aer_broadcast_data
*result_data
;
278 result_data
= (struct aer_broadcast_data
*) data
;
281 !dev
->driver
->err_handler
||
282 !dev
->driver
->err_handler
->mmio_enabled
)
285 err_handler
= dev
->driver
->err_handler
;
286 vote
= err_handler
->mmio_enabled(dev
);
287 result_data
->result
= merge_result(result_data
->result
, vote
);
291 static int report_slot_reset(struct pci_dev
*dev
, void *data
)
293 pci_ers_result_t vote
;
294 struct pci_error_handlers
*err_handler
;
295 struct aer_broadcast_data
*result_data
;
296 result_data
= (struct aer_broadcast_data
*) data
;
299 !dev
->driver
->err_handler
||
300 !dev
->driver
->err_handler
->slot_reset
)
303 err_handler
= dev
->driver
->err_handler
;
304 vote
= err_handler
->slot_reset(dev
);
305 result_data
->result
= merge_result(result_data
->result
, vote
);
309 static int report_resume(struct pci_dev
*dev
, void *data
)
311 struct pci_error_handlers
*err_handler
;
313 dev
->error_state
= pci_channel_io_normal
;
316 !dev
->driver
->err_handler
||
317 !dev
->driver
->err_handler
->resume
)
320 err_handler
= dev
->driver
->err_handler
;
321 err_handler
->resume(dev
);
326 * broadcast_error_message - handle message broadcast to downstream drivers
327 * @dev: pointer to the device from which the message is broadcast down the hierarchy
328 * @state: error state
329 * @error_mesg: message to print
330 * @cb: callback to be broadcasted
332 * Invoked during the error recovery process. Once invoked, the content
333 * of error severity will be broadcast to all downstream drivers in the
334 * hierarchy in question.
336 static pci_ers_result_t
broadcast_error_message(struct pci_dev
*dev
,
337 enum pci_channel_state state
,
339 int (*cb
)(struct pci_dev
*, void *))
341 struct aer_broadcast_data result_data
;
343 dev_printk(KERN_DEBUG
, &dev
->dev
, "broadcast %s message\n", error_mesg
);
344 result_data
.state
= state
;
345 if (cb
== report_error_detected
)
346 result_data
.result
= PCI_ERS_RESULT_CAN_RECOVER
;
348 result_data
.result
= PCI_ERS_RESULT_RECOVERED
;
350 if (dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
) {
352 * If the error is reported by a bridge, we think this error
353 * is related to the downstream link of the bridge, so we
354 * do error recovery on all subordinates of the bridge instead
355 * of the bridge and clear the error status of the bridge.
357 if (cb
== report_error_detected
)
358 dev
->error_state
= state
;
359 pci_walk_bus(dev
->subordinate
, cb
, &result_data
);
360 if (cb
== report_resume
) {
361 pci_cleanup_aer_uncorrect_error_status(dev
);
362 dev
->error_state
= pci_channel_io_normal
;
366 * If the error is reported by an end point, we think this
367 * error is related to the upstream link of the end point.
369 pci_walk_bus(dev
->bus
, cb
, &result_data
);
372 return result_data
.result
;
376 * aer_do_secondary_bus_reset - perform secondary bus reset
377 * @dev: pointer to bridge's pci_dev data structure
379 * Invoked when performing link reset at Root Port or Downstream Port.
381 void aer_do_secondary_bus_reset(struct pci_dev
*dev
)
385 /* Assert Secondary Bus Reset */
386 pci_read_config_word(dev
, PCI_BRIDGE_CONTROL
, &p2p_ctrl
);
387 p2p_ctrl
|= PCI_BRIDGE_CTL_BUS_RESET
;
388 pci_write_config_word(dev
, PCI_BRIDGE_CONTROL
, p2p_ctrl
);
391 * we should send hot reset message for 2ms to allow it time to
392 * propagate to all downstream ports
396 /* De-assert Secondary Bus Reset */
397 p2p_ctrl
&= ~PCI_BRIDGE_CTL_BUS_RESET
;
398 pci_write_config_word(dev
, PCI_BRIDGE_CONTROL
, p2p_ctrl
);
401 * System software must wait for at least 100ms from the end
402 * of a reset of one or more devices before it is permitted
403 * to issue Configuration Requests to those devices.
409 * default_downstream_reset_link - default reset function for Downstream Port
410 * @dev: pointer to downstream port's pci_dev data structure
412 * Invoked when performing link reset at Downstream Port w/ no aer driver.
414 static pci_ers_result_t
default_downstream_reset_link(struct pci_dev
*dev
)
416 aer_do_secondary_bus_reset(dev
);
417 dev_printk(KERN_DEBUG
, &dev
->dev
,
418 "Downstream Port link has been reset\n");
419 return PCI_ERS_RESULT_RECOVERED
;
422 static int find_aer_service_iter(struct device
*device
, void *data
)
424 struct pcie_port_service_driver
*service_driver
, **drv
;
426 drv
= (struct pcie_port_service_driver
**) data
;
428 if (device
->bus
== &pcie_port_bus_type
&& device
->driver
) {
429 service_driver
= to_service_driver(device
->driver
);
430 if (service_driver
->service
== PCIE_PORT_SERVICE_AER
) {
431 *drv
= service_driver
;
439 static struct pcie_port_service_driver
*find_aer_service(struct pci_dev
*dev
)
441 struct pcie_port_service_driver
*drv
= NULL
;
443 device_for_each_child(&dev
->dev
, &drv
, find_aer_service_iter
);
448 static pci_ers_result_t
reset_link(struct pcie_device
*aerdev
,
451 struct pci_dev
*udev
;
452 pci_ers_result_t status
;
453 struct pcie_port_service_driver
*driver
;
455 if (dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
) {
456 /* Reset this port for all subordinates */
459 /* Reset the upstream component (likely downstream port) */
460 udev
= dev
->bus
->self
;
463 /* Use the aer driver of the component firstly */
464 driver
= find_aer_service(udev
);
466 if (driver
&& driver
->reset_link
) {
467 status
= driver
->reset_link(udev
);
468 } else if (udev
->pcie_type
== PCI_EXP_TYPE_DOWNSTREAM
) {
469 status
= default_downstream_reset_link(udev
);
471 dev_printk(KERN_DEBUG
, &dev
->dev
,
472 "no link-reset support at upstream device %s\n",
474 return PCI_ERS_RESULT_DISCONNECT
;
477 if (status
!= PCI_ERS_RESULT_RECOVERED
) {
478 dev_printk(KERN_DEBUG
, &dev
->dev
,
479 "link reset at upstream device %s failed\n",
481 return PCI_ERS_RESULT_DISCONNECT
;
488 * do_recovery - handle nonfatal/fatal error recovery process
489 * @aerdev: pointer to a pcie_device data structure of root port
490 * @dev: pointer to a pci_dev data structure of agent detecting an error
491 * @severity: error severity type
493 * Invoked when an error is nonfatal/fatal. Once invoked, broadcast an
494 * error detected message to all downstream drivers within the hierarchy in
495 * question, then proceed with recovery based on the returned status.
497 static void do_recovery(struct pcie_device
*aerdev
, struct pci_dev
*dev
,
500 pci_ers_result_t status
, result
= PCI_ERS_RESULT_RECOVERED
;
501 enum pci_channel_state state
;
503 if (severity
== AER_FATAL
)
504 state
= pci_channel_io_frozen
;
506 state
= pci_channel_io_normal
;
508 status
= broadcast_error_message(dev
,
511 report_error_detected
);
513 if (severity
== AER_FATAL
) {
514 result
= reset_link(aerdev
, dev
);
515 if (result
!= PCI_ERS_RESULT_RECOVERED
)
519 if (status
== PCI_ERS_RESULT_CAN_RECOVER
)
520 status
= broadcast_error_message(dev
,
523 report_mmio_enabled
);
525 if (status
== PCI_ERS_RESULT_NEED_RESET
) {
527 * TODO: Should call platform-specific
528 * functions to reset slot before calling
529 * drivers' slot_reset callbacks?
531 status
= broadcast_error_message(dev
,
537 if (status
!= PCI_ERS_RESULT_RECOVERED
)
540 broadcast_error_message(dev
,
545 dev_printk(KERN_DEBUG
, &dev
->dev
,
546 "AER driver successfully recovered\n");
550 /* TODO: Should kernel panic here? */
551 dev_printk(KERN_DEBUG
, &dev
->dev
,
552 "AER driver didn't recover\n");
556 * handle_error_source - handle logging error into an event log
557 * @aerdev: pointer to pcie_device data structure of the root port
558 * @dev: pointer to pci_dev data structure of error source device
559 * @info: comprehensive error information
561 * Invoked when an error is detected by the Root Port.
563 static void handle_error_source(struct pcie_device
*aerdev
,
565 struct aer_err_info
*info
)
569 if (info
->severity
== AER_CORRECTABLE
) {
571 * Correctable error does not need software intervention.
572 * No need to go through error recovery process.
574 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
576 pci_write_config_dword(dev
, pos
+ PCI_ERR_COR_STATUS
,
579 do_recovery(aerdev
, dev
, info
->severity
);
583 * get_device_error_info - read error status from dev and store it to info
584 * @dev: pointer to the device expected to have an error record
585 * @info: pointer to structure to store the error record
587 * Return 1 on success, 0 on error.
589 * Note that @info is reused among all error devices. Clear fields properly.
591 static int get_device_error_info(struct pci_dev
*dev
, struct aer_err_info
*info
)
595 /* Must reset in this function */
597 info
->tlp_header_valid
= 0;
599 pos
= pci_find_ext_capability(dev
, PCI_EXT_CAP_ID_ERR
);
601 /* The device might not support AER */
605 if (info
->severity
== AER_CORRECTABLE
) {
606 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_STATUS
,
608 pci_read_config_dword(dev
, pos
+ PCI_ERR_COR_MASK
,
610 if (!(info
->status
& ~info
->mask
))
612 } else if (dev
->hdr_type
& PCI_HEADER_TYPE_BRIDGE
||
613 info
->severity
== AER_NONFATAL
) {
615 /* Link is still healthy for IO reads */
616 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_STATUS
,
618 pci_read_config_dword(dev
, pos
+ PCI_ERR_UNCOR_MASK
,
620 if (!(info
->status
& ~info
->mask
))
623 /* Get First Error Pointer */
624 pci_read_config_dword(dev
, pos
+ PCI_ERR_CAP
, &temp
);
625 info
->first_error
= PCI_ERR_CAP_FEP(temp
);
627 if (info
->status
& AER_LOG_TLP_MASKS
) {
628 info
->tlp_header_valid
= 1;
629 pci_read_config_dword(dev
,
630 pos
+ PCI_ERR_HEADER_LOG
, &info
->tlp
.dw0
);
631 pci_read_config_dword(dev
,
632 pos
+ PCI_ERR_HEADER_LOG
+ 4, &info
->tlp
.dw1
);
633 pci_read_config_dword(dev
,
634 pos
+ PCI_ERR_HEADER_LOG
+ 8, &info
->tlp
.dw2
);
635 pci_read_config_dword(dev
,
636 pos
+ PCI_ERR_HEADER_LOG
+ 12, &info
->tlp
.dw3
);
643 static inline void aer_process_err_devices(struct pcie_device
*p_device
,
644 struct aer_err_info
*e_info
)
648 /* Report all errors before handling them, so records are not lost by a reset etc. */
649 for (i
= 0; i
< e_info
->error_dev_num
&& e_info
->dev
[i
]; i
++) {
650 if (get_device_error_info(e_info
->dev
[i
], e_info
))
651 aer_print_error(e_info
->dev
[i
], e_info
);
653 for (i
= 0; i
< e_info
->error_dev_num
&& e_info
->dev
[i
]; i
++) {
654 if (get_device_error_info(e_info
->dev
[i
], e_info
))
655 handle_error_source(p_device
, e_info
->dev
[i
], e_info
);
660 * aer_isr_one_error - consume an error detected by root port
661 * @p_device: pointer to error root port service device
662 * @e_src: pointer to an error source
664 static void aer_isr_one_error(struct pcie_device
*p_device
,
665 struct aer_err_source
*e_src
)
667 struct aer_err_info
*e_info
;
669 /* struct aer_err_info might be big, so we allocate it with slab */
670 e_info
= kmalloc(sizeof(struct aer_err_info
), GFP_KERNEL
);
672 dev_printk(KERN_DEBUG
, &p_device
->port
->dev
,
673 "Can't allocate mem when processing AER errors\n");
678 * There is a possibility that both a correctable error and an
679 * uncorrectable error are logged. Report the correctable error first.
681 if (e_src
->status
& PCI_ERR_ROOT_COR_RCV
) {
682 e_info
->id
= ERR_COR_ID(e_src
->id
);
683 e_info
->severity
= AER_CORRECTABLE
;
685 if (e_src
->status
& PCI_ERR_ROOT_MULTI_COR_RCV
)
686 e_info
->multi_error_valid
= 1;
688 e_info
->multi_error_valid
= 0;
690 aer_print_port_info(p_device
->port
, e_info
);
692 if (find_source_device(p_device
->port
, e_info
))
693 aer_process_err_devices(p_device
, e_info
);
696 if (e_src
->status
& PCI_ERR_ROOT_UNCOR_RCV
) {
697 e_info
->id
= ERR_UNCOR_ID(e_src
->id
);
699 if (e_src
->status
& PCI_ERR_ROOT_FATAL_RCV
)
700 e_info
->severity
= AER_FATAL
;
702 e_info
->severity
= AER_NONFATAL
;
704 if (e_src
->status
& PCI_ERR_ROOT_MULTI_UNCOR_RCV
)
705 e_info
->multi_error_valid
= 1;
707 e_info
->multi_error_valid
= 0;
709 aer_print_port_info(p_device
->port
, e_info
);
711 if (find_source_device(p_device
->port
, e_info
))
712 aer_process_err_devices(p_device
, e_info
);
719 * get_e_source - retrieve an error source
720 * @rpc: pointer to the root port which holds an error
721 * @e_src: pointer to store retrieved error source
723 * Return 1 if an error source is retrieved, otherwise 0.
725 * Invoked by DPC handler to consume an error.
727 static int get_e_source(struct aer_rpc
*rpc
, struct aer_err_source
*e_src
)
732 /* Lock access to Root error producer/consumer index */
733 spin_lock_irqsave(&rpc
->e_lock
, flags
);
734 if (rpc
->prod_idx
!= rpc
->cons_idx
) {
735 *e_src
= rpc
->e_sources
[rpc
->cons_idx
];
737 if (rpc
->cons_idx
== AER_ERROR_SOURCES_MAX
)
741 spin_unlock_irqrestore(&rpc
->e_lock
, flags
);
747 * aer_isr - consume errors detected by root port
748 * @work: definition of this work item
750 * Invoked, as DPC, when the root port records a newly detected error
752 void aer_isr(struct work_struct
*work
)
754 struct aer_rpc
*rpc
= container_of(work
, struct aer_rpc
, dpc_handler
);
755 struct pcie_device
*p_device
= rpc
->rpd
;
756 struct aer_err_source e_src
;
758 mutex_lock(&rpc
->rpc_mutex
);
759 while (get_e_source(rpc
, &e_src
))
760 aer_isr_one_error(p_device
, &e_src
);
761 mutex_unlock(&rpc
->rpc_mutex
);
763 wake_up(&rpc
->wait_release
);
767 * aer_init - provide AER initialization
768 * @dev: pointer to AER pcie device
770 * Invoked when AER service driver is loaded.
772 int aer_init(struct pcie_device
*dev
)
774 if (pcie_aer_get_firmware_first(dev
->port
)) {
775 dev_printk(KERN_DEBUG
, &dev
->device
,
776 "PCIe errors handled by platform firmware.\n");
780 if (aer_osc_setup(dev
))
786 dev_printk(KERN_DEBUG
, &dev
->device
,
787 "aerdrv forceload requested.\n");
788 pcie_aer_force_firmware_first(dev
->port
, 0);