/*
 * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
 * Copyright IBM Corp. 2004 2005
 * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
 */
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/pci.h>

#include <asm/eeh_event.h>
#include <asm/ppc-pci.h>
#include <asm/pci-bridge.h>
37 * eeh_pcid_name - Retrieve name of PCI device driver
40 * This routine is used to retrieve the name of PCI device driver
43 static inline const char *eeh_pcid_name(struct pci_dev
*pdev
)
45 if (pdev
&& pdev
->dev
.driver
)
46 return pdev
->dev
.driver
->name
;
#if 0
/*
 * print_device_node_tree - Dump a device-node subtree with its EEH fields
 * @pdn: PCI device node to start from; NULL is silently ignored
 * @dent: number of spaces to indent this level by
 *
 * Debugging aid: prints one line for @pdn, then recurses over the
 * children of its device node with the indentation increased by three.
 * Compiled out; nothing visible in this file calls it.
 */
static void print_device_node_tree(struct pci_dn *pdn, int dent)
{
	int i;
	struct device_node *pc;

	if (!pdn)
		return;

	/* Indent to the current depth. */
	for (i = 0; i < dent; i++)
		printk(" ");
	printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
		pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
		pdn->eeh_pe_config_addr, pdn->node->full_name);

	dent += 3;
	pc = pdn->node->child;
	while (pc) {
		print_device_node_tree(PCI_DN(pc), dent);
		pc = pc->sibling;
	}
}
#endif
73 * eeh_disable_irq - Disable interrupt for the recovering device
76 * This routine must be called when reporting temporary or permanent
77 * error to the particular PCI device to disable interrupt of that
78 * device. If the device has enabled MSI or MSI-X interrupt, we needn't
79 * do real work because EEH should freeze DMA transfers for those PCI
80 * devices encountering EEH errors, which includes MSI or MSI-X.
82 static void eeh_disable_irq(struct pci_dev
*dev
)
84 struct eeh_dev
*edev
= pci_dev_to_eeh_dev(dev
);
86 /* Don't disable MSI and MSI-X interrupts. They are
87 * effectively disabled by the DMA Stopped state
88 * when an EEH error occurs.
90 if (dev
->msi_enabled
|| dev
->msix_enabled
)
93 if (!irq_has_action(dev
->irq
))
96 edev
->mode
|= EEH_MODE_IRQ_DISABLED
;
97 disable_irq_nosync(dev
->irq
);
101 * eeh_enable_irq - Enable interrupt for the recovering device
104 * This routine must be called to enable interrupt while failed
105 * device could be resumed.
107 static void eeh_enable_irq(struct pci_dev
*dev
)
109 struct eeh_dev
*edev
= pci_dev_to_eeh_dev(dev
);
111 if ((edev
->mode
) & EEH_MODE_IRQ_DISABLED
) {
112 edev
->mode
&= ~EEH_MODE_IRQ_DISABLED
;
113 enable_irq(dev
->irq
);
118 * eeh_report_error - Report pci error to each device driver
120 * @userdata: return value
122 * Report an EEH error to each device driver, collect up and
123 * merge the device driver responses. Cumulative response
124 * passed back in "userdata".
126 static int eeh_report_error(struct pci_dev
*dev
, void *userdata
)
128 enum pci_ers_result rc
, *res
= userdata
;
129 struct pci_driver
*driver
= dev
->driver
;
131 dev
->error_state
= pci_channel_io_frozen
;
136 eeh_disable_irq(dev
);
138 if (!driver
->err_handler
||
139 !driver
->err_handler
->error_detected
)
142 rc
= driver
->err_handler
->error_detected(dev
, pci_channel_io_frozen
);
144 /* A driver that needs a reset trumps all others */
145 if (rc
== PCI_ERS_RESULT_NEED_RESET
) *res
= rc
;
146 if (*res
== PCI_ERS_RESULT_NONE
) *res
= rc
;
152 * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
154 * @userdata: return value
156 * Tells each device driver that IO ports, MMIO and config space I/O
157 * are now enabled. Collects up and merges the device driver responses.
158 * Cumulative response passed back in "userdata".
160 static int eeh_report_mmio_enabled(struct pci_dev
*dev
, void *userdata
)
162 enum pci_ers_result rc
, *res
= userdata
;
163 struct pci_driver
*driver
= dev
->driver
;
166 !driver
->err_handler
||
167 !driver
->err_handler
->mmio_enabled
)
170 rc
= driver
->err_handler
->mmio_enabled(dev
);
172 /* A driver that needs a reset trumps all others */
173 if (rc
== PCI_ERS_RESULT_NEED_RESET
) *res
= rc
;
174 if (*res
== PCI_ERS_RESULT_NONE
) *res
= rc
;
180 * eeh_report_reset - Tell device that slot has been reset
182 * @userdata: return value
184 * This routine must be called while EEH tries to reset particular
185 * PCI device so that the associated PCI device driver could take
186 * some actions, usually to save data the driver needs so that the
187 * driver can work again while the device is recovered.
189 static int eeh_report_reset(struct pci_dev
*dev
, void *userdata
)
191 enum pci_ers_result rc
, *res
= userdata
;
192 struct pci_driver
*driver
= dev
->driver
;
197 dev
->error_state
= pci_channel_io_normal
;
201 if (!driver
->err_handler
||
202 !driver
->err_handler
->slot_reset
)
205 rc
= driver
->err_handler
->slot_reset(dev
);
206 if ((*res
== PCI_ERS_RESULT_NONE
) ||
207 (*res
== PCI_ERS_RESULT_RECOVERED
)) *res
= rc
;
208 if (*res
== PCI_ERS_RESULT_DISCONNECT
&&
209 rc
== PCI_ERS_RESULT_NEED_RESET
) *res
= rc
;
215 * eeh_report_resume - Tell device to resume normal operations
217 * @userdata: return value
219 * This routine must be called to notify the device driver that it
220 * could resume so that the device driver can do some initialization
221 * to make the recovered device work again.
223 static int eeh_report_resume(struct pci_dev
*dev
, void *userdata
)
225 struct pci_driver
*driver
= dev
->driver
;
227 dev
->error_state
= pci_channel_io_normal
;
234 if (!driver
->err_handler
||
235 !driver
->err_handler
->resume
)
238 driver
->err_handler
->resume(dev
);
244 * eeh_report_failure - Tell device driver that device is dead.
246 * @userdata: return value
248 * This informs the device driver that the device is permanently
249 * dead, and that no further recovery attempts will be made on it.
251 static int eeh_report_failure(struct pci_dev
*dev
, void *userdata
)
253 struct pci_driver
*driver
= dev
->driver
;
255 dev
->error_state
= pci_channel_io_perm_failure
;
260 eeh_disable_irq(dev
);
262 if (!driver
->err_handler
||
263 !driver
->err_handler
->error_detected
)
266 driver
->err_handler
->error_detected(dev
, pci_channel_io_perm_failure
);
272 * eeh_reset_device - Perform actual reset of a pci slot
273 * @edev: PE associated EEH device
274 * @bus: PCI bus corresponding to the isolcated slot
276 * This routine must be called to do reset on the indicated PE.
277 * During the reset, udev might be invoked because those affected
278 * PCI devices will be removed and then added.
280 static int eeh_reset_device(struct eeh_dev
*edev
, struct pci_bus
*bus
)
282 struct device_node
*dn
;
285 /* pcibios will clear the counter; save the value */
286 cnt
= edev
->freeze_count
;
289 pcibios_remove_pci_devices(bus
);
291 /* Reset the pci controller. (Asserts RST#; resets config space).
292 * Reconfigure bridges and devices. Don't try to bring the system
293 * up if the reset failed for some reason.
295 rc
= eeh_reset_pe(edev
);
299 /* Walk over all functions on this device. */
300 dn
= eeh_dev_to_of_node(edev
);
301 if (!pcibios_find_pci_bus(dn
) && of_node_to_eeh_dev(dn
->parent
))
302 dn
= dn
->parent
->child
;
305 struct eeh_dev
*pedev
= of_node_to_eeh_dev(dn
);
307 /* On Power4, always true because eeh_pe_config_addr=0 */
308 if (edev
->pe_config_addr
== pedev
->pe_config_addr
) {
309 eeh_ops
->configure_bridge(dn
);
310 eeh_restore_bars(pedev
);
315 /* Give the system 5 seconds to finish running the user-space
316 * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
317 * this is a hack, but if we don't do this, and try to bring
318 * the device up before the scripts have taken it down,
319 * potentially weird things happen.
323 pcibios_add_pci_devices(bus
);
325 edev
->freeze_count
= cnt
;
/* The longest amount of time to wait for a pci device
 * to come back on line, in seconds.
 */
#define MAX_WAIT_FOR_RECOVERY 150
336 * eeh_handle_event - Reset a PCI device after hard lockup.
339 * While PHB detects address or data parity errors on particular PCI
340 * slot, the associated PE will be frozen. Besides, DMA's occurring
341 * to wild addresses (which usually happen due to bugs in device
342 * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
343 * #PERR or other misc PCI-related errors also can trigger EEH errors.
345 * Recovery process consists of unplugging the device driver (which
346 * generated hotplug events to userspace), then issuing a PCI #RST to
347 * the device, then reconfiguring the PCI config space for all bridges
348 * & devices under this slot, and then finally restarting the device
349 * drivers (which cause a second set of hotplug events to go out to
352 struct eeh_dev
*handle_eeh_events(struct eeh_event
*event
)
354 struct device_node
*frozen_dn
;
355 struct eeh_dev
*frozen_edev
;
356 struct pci_bus
*frozen_bus
;
358 enum pci_ers_result result
= PCI_ERS_RESULT_NONE
;
359 const char *location
, *pci_str
, *drv_str
, *bus_pci_str
, *bus_drv_str
;
361 frozen_dn
= eeh_find_device_pe(eeh_dev_to_of_node(event
->edev
));
363 location
= of_get_property(eeh_dev_to_of_node(event
->edev
), "ibm,loc-code", NULL
);
364 location
= location
? location
: "unknown";
365 printk(KERN_ERR
"EEH: Error: Cannot find partition endpoint "
366 "for location=%s pci addr=%s\n",
367 location
, eeh_pci_name(eeh_dev_to_pci_dev(event
->edev
)));
371 frozen_bus
= pcibios_find_pci_bus(frozen_dn
);
372 location
= of_get_property(frozen_dn
, "ibm,loc-code", NULL
);
373 location
= location
? location
: "unknown";
375 /* There are two different styles for coming up with the PE.
376 * In the old style, it was the highest EEH-capable device
377 * which was always an EADS pci bridge. In the new style,
378 * there might not be any EADS bridges, and even when there are,
379 * the firmware marks them as "EEH incapable". So another
380 * two-step is needed to find the pci bus..
383 frozen_bus
= pcibios_find_pci_bus(frozen_dn
->parent
);
386 printk(KERN_ERR
"EEH: Cannot find PCI bus "
387 "for location=%s dn=%s\n",
388 location
, frozen_dn
->full_name
);
392 frozen_edev
= of_node_to_eeh_dev(frozen_dn
);
393 frozen_edev
->freeze_count
++;
394 pci_str
= eeh_pci_name(eeh_dev_to_pci_dev(event
->edev
));
395 drv_str
= eeh_pcid_name(eeh_dev_to_pci_dev(event
->edev
));
397 if (frozen_edev
->freeze_count
> EEH_MAX_ALLOWED_FREEZES
)
398 goto excess_failures
;
401 "EEH: This PCI device has failed %d times in the last hour:\n",
402 frozen_edev
->freeze_count
);
404 if (frozen_edev
->pdev
) {
405 bus_pci_str
= pci_name(frozen_edev
->pdev
);
406 bus_drv_str
= eeh_pcid_name(frozen_edev
->pdev
);
408 "EEH: Bus location=%s driver=%s pci addr=%s\n",
409 location
, bus_drv_str
, bus_pci_str
);
413 "EEH: Device location=%s driver=%s pci addr=%s\n",
414 location
, drv_str
, pci_str
);
416 /* Walk the various device drivers attached to this slot through
417 * a reset sequence, giving each an opportunity to do what it needs
418 * to accomplish the reset. Each child gets a report of the
419 * status ... if any child can't handle the reset, then the entire
420 * slot is dlpar removed and added.
422 pci_walk_bus(frozen_bus
, eeh_report_error
, &result
);
424 /* Get the current PCI slot state. This can take a long time,
425 * sometimes over 3 seconds for certain systems.
427 rc
= eeh_ops
->wait_state(eeh_dev_to_of_node(frozen_edev
), MAX_WAIT_FOR_RECOVERY
*1000);
428 if (rc
< 0 || rc
== EEH_STATE_NOT_SUPPORT
) {
429 printk(KERN_WARNING
"EEH: Permanent failure\n");
433 /* Since rtas may enable MMIO when posting the error log,
434 * don't post the error log until after all dev drivers
435 * have been informed.
437 eeh_slot_error_detail(frozen_edev
, EEH_LOG_TEMP
);
439 /* If all device drivers were EEH-unaware, then shut
440 * down all of the device drivers, and hope they
441 * go down willingly, without panicing the system.
443 if (result
== PCI_ERS_RESULT_NONE
) {
444 rc
= eeh_reset_device(frozen_edev
, frozen_bus
);
446 printk(KERN_WARNING
"EEH: Unable to reset, rc=%d\n", rc
);
451 /* If all devices reported they can proceed, then re-enable MMIO */
452 if (result
== PCI_ERS_RESULT_CAN_RECOVER
) {
453 rc
= eeh_pci_enable(frozen_edev
, EEH_OPT_THAW_MMIO
);
458 result
= PCI_ERS_RESULT_NEED_RESET
;
460 result
= PCI_ERS_RESULT_NONE
;
461 pci_walk_bus(frozen_bus
, eeh_report_mmio_enabled
, &result
);
465 /* If all devices reported they can proceed, then re-enable DMA */
466 if (result
== PCI_ERS_RESULT_CAN_RECOVER
) {
467 rc
= eeh_pci_enable(frozen_edev
, EEH_OPT_THAW_DMA
);
472 result
= PCI_ERS_RESULT_NEED_RESET
;
474 result
= PCI_ERS_RESULT_RECOVERED
;
477 /* If any device has a hard failure, then shut off everything. */
478 if (result
== PCI_ERS_RESULT_DISCONNECT
) {
479 printk(KERN_WARNING
"EEH: Device driver gave up\n");
483 /* If any device called out for a reset, then reset the slot */
484 if (result
== PCI_ERS_RESULT_NEED_RESET
) {
485 rc
= eeh_reset_device(frozen_edev
, NULL
);
487 printk(KERN_WARNING
"EEH: Cannot reset, rc=%d\n", rc
);
490 result
= PCI_ERS_RESULT_NONE
;
491 pci_walk_bus(frozen_bus
, eeh_report_reset
, &result
);
494 /* All devices should claim they have recovered by now. */
495 if ((result
!= PCI_ERS_RESULT_RECOVERED
) &&
496 (result
!= PCI_ERS_RESULT_NONE
)) {
497 printk(KERN_WARNING
"EEH: Not recovered\n");
501 /* Tell all device drivers that they can resume operations */
502 pci_walk_bus(frozen_bus
, eeh_report_resume
, NULL
);
508 * About 90% of all real-life EEH failures in the field
509 * are due to poorly seated PCI cards. Only 10% or so are
510 * due to actual, failed cards.
513 "EEH: PCI device at location=%s driver=%s pci addr=%s\n"
514 "has failed %d times in the last hour "
515 "and has been permanently disabled.\n"
516 "Please try reseating this device or replacing it.\n",
517 location
, drv_str
, pci_str
, frozen_edev
->freeze_count
);
522 "EEH: Unable to recover from failure of PCI device "
523 "at location=%s driver=%s pci addr=%s\n"
524 "Please try reseating this device or replacing it.\n",
525 location
, drv_str
, pci_str
);
528 eeh_slot_error_detail(frozen_edev
, EEH_LOG_PERM
);
530 /* Notify all devices that they're about to go down. */
531 pci_walk_bus(frozen_bus
, eeh_report_failure
, NULL
);
533 /* Shut down the device drivers for good. */
534 pcibios_remove_pci_devices(frozen_bus
);