2 * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
3 * Copyright (C) 2004, 2005 Linas Vepstas <linas@linas.org>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or (at
10 * your option) any later version.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 * Send feedback to <linas@us.ibm.com>
25 #include <linux/delay.h>
26 #include <linux/irq.h>
27 #include <linux/interrupt.h>
28 #include <linux/notifier.h>
29 #include <linux/pci.h>
31 #include <asm/eeh_event.h>
32 #include <asm/ppc-pci.h>
33 #include <asm/pci-bridge.h>
/**
 * pcid_name - return the name of the driver bound to a PCI device.
 * @pdev: PCI device whose driver name is wanted.
 *
 * Reads pdev->dev.driver->name and returns it.
 * NOTE(review): no NULL check on pdev->dev.driver is visible here;
 * confirm that callers only pass devices that have a driver bound.
 */
38 static inline const char * pcid_name (struct pci_dev
*pdev
)
41 return pdev
->dev
.driver
->name
;
/**
 * print_device_node_tree - debug dump of a pci_dn and its child nodes.
 * @pdn:  node to print; the printk shows node->name, eeh_mode,
 *        eeh_config_addr, eeh_pe_config_addr and node->full_name.
 * @dent: passed unchanged to the recursive call; presumably an
 *        indentation depth (its use is not visible in this listing).
 *
 * After printing, starts from pdn->node->child and recurses on
 * PCI_DN(pc).
 * NOTE(review): the loop that advances over the children is not
 * visible in this listing; confirm against the full file.
 */
46 static void print_device_node_tree (struct pci_dn
*pdn
, int dent
)
52 printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
53 pdn
->node
->name
, pdn
->eeh_mode
, pdn
->eeh_config_addr
,
54 pdn
->eeh_pe_config_addr
, pdn
->node
->full_name
);
56 struct device_node
*pc
= pdn
->node
->child
;
58 print_device_node_tree(PCI_DN(pc
), dent
);
65 * irq_in_use - return true if this irq is being used
/**
 * irq_in_use - inspect the descriptor of an interrupt line under its lock.
 * @irq: interrupt number; used as an index into the irq_desc array.
 *
 * Looks up the descriptor as irq_desc + irq and brackets the inspection
 * with spin_lock_irqsave()/spin_unlock_irqrestore() on desc->lock.
 * NOTE(review): the test performed while the lock is held and the
 * returned value are not visible in this listing; confirm what
 * "in use" means against the full file.
 */
67 static int irq_in_use(unsigned int irq
)
71 struct irq_desc
*desc
= irq_desc
+ irq
;
73 spin_lock_irqsave(&desc
->lock
, flags
);
76 spin_unlock_irqrestore(&desc
->lock
, flags
);
80 /* ------------------------------------------------------- */
81 /** eeh_report_error - report an EEH error to each device,
82 * collect up and merge the device responses.
/**
 * eeh_report_error - per-device callback: report a frozen channel.
 * @dev:      PCI device being visited.
 * @userdata: points to an enum pci_ers_result that accumulates the
 *            merged answer across all visited devices.
 *
 * Visible steps, in order:
 *  - dev->error_state is set to pci_channel_io_frozen.
 *  - If irq_in_use(dev->irq), EEH_MODE_IRQ_DISABLED is set in the
 *    device node's eeh_mode and the irq is disabled with
 *    disable_irq_nosync().
 *  - driver->err_handler and ->error_detected are tested for NULL
 *    (the action taken when they are NULL is not visible here).
 *  - error_detected(dev, pci_channel_io_frozen) is called and its
 *    result rc is merged into *res:
 *      *res == NONE                          -> *res = rc
 *      *res == NEED_RESET                    -> return, keep *res
 *      *res == DISCONNECT, rc == NEED_RESET  -> *res = rc
 *
 * NOTE(review): no NULL check on dev->driver itself is visible in this
 * listing; confirm against the full file.
 */
85 static void eeh_report_error(struct pci_dev
*dev
, void *userdata
)
87 enum pci_ers_result rc
, *res
= userdata
;
88 struct pci_driver
*driver
= dev
->driver
;
90 dev
->error_state
= pci_channel_io_frozen
;
95 if (irq_in_use (dev
->irq
)) {
96 struct device_node
*dn
= pci_device_to_OF_node(dev
);
97 PCI_DN(dn
)->eeh_mode
|= EEH_MODE_IRQ_DISABLED
;
98 disable_irq_nosync(dev
->irq
);
100 if (!driver
->err_handler
)
102 if (!driver
->err_handler
->error_detected
)
105 rc
= driver
->err_handler
->error_detected (dev
, pci_channel_io_frozen
);
106 if (*res
== PCI_ERS_RESULT_NONE
) *res
= rc
;
107 if (*res
== PCI_ERS_RESULT_NEED_RESET
) return;
108 if (*res
== PCI_ERS_RESULT_DISCONNECT
&&
109 rc
== PCI_ERS_RESULT_NEED_RESET
) *res
= rc
;
112 /** eeh_report_reset -- tell this device that the pci slot
/**
 * eeh_report_reset - per-device callback: the slot has been reset.
 * @dev:      PCI device being visited.
 * @userdata: not referenced in the visible lines.
 *
 * If EEH_MODE_IRQ_DISABLED is set in the device node's eeh_mode, the
 * flag is cleared and the irq is re-enabled with enable_irq(); this
 * mirrors the disable done by the error/failure callbacks. Then
 * driver->err_handler and ->slot_reset are tested for NULL (the action
 * taken when they are NULL is not visible here) and slot_reset(dev) is
 * called; its return value is not used.
 */
116 static void eeh_report_reset(struct pci_dev
*dev
, void *userdata
)
118 struct pci_driver
*driver
= dev
->driver
;
119 struct device_node
*dn
= pci_device_to_OF_node(dev
);
124 if ((PCI_DN(dn
)->eeh_mode
) & EEH_MODE_IRQ_DISABLED
) {
125 PCI_DN(dn
)->eeh_mode
&= ~EEH_MODE_IRQ_DISABLED
;
126 enable_irq(dev
->irq
);
128 if (!driver
->err_handler
)
130 if (!driver
->err_handler
->slot_reset
)
133 driver
->err_handler
->slot_reset(dev
);
/**
 * eeh_report_resume - per-device callback: normal operation may resume.
 * @dev:      PCI device being visited.
 * @userdata: not referenced in the visible lines.
 *
 * Sets dev->error_state back to pci_channel_io_normal, tests
 * driver->err_handler and ->resume for NULL (the action taken when they
 * are NULL is not visible here), then calls resume(dev).
 */
136 static void eeh_report_resume(struct pci_dev
*dev
, void *userdata
)
138 struct pci_driver
*driver
= dev
->driver
;
140 dev
->error_state
= pci_channel_io_normal
;
144 if (!driver
->err_handler
)
146 if (!driver
->err_handler
->resume
)
149 driver
->err_handler
->resume(dev
);
/**
 * eeh_report_failure - per-device callback: the device has failed for good.
 * @dev:      PCI device being visited.
 * @userdata: not referenced in the visible lines.
 *
 * Sets dev->error_state to pci_channel_io_perm_failure. If
 * irq_in_use(dev->irq), sets EEH_MODE_IRQ_DISABLED in the device node's
 * eeh_mode and disables the irq with disable_irq_nosync(). Then tests
 * driver->err_handler and ->error_detected for NULL (the action taken
 * when they are NULL is not visible here) and calls
 * error_detected(dev, pci_channel_io_perm_failure); unlike
 * eeh_report_error, the return value is discarded.
 */
152 static void eeh_report_failure(struct pci_dev
*dev
, void *userdata
)
154 struct pci_driver
*driver
= dev
->driver
;
156 dev
->error_state
= pci_channel_io_perm_failure
;
161 if (irq_in_use (dev
->irq
)) {
162 struct device_node
*dn
= pci_device_to_OF_node(dev
);
163 PCI_DN(dn
)->eeh_mode
|= EEH_MODE_IRQ_DISABLED
;
164 disable_irq_nosync(dev
->irq
);
166 if (!driver
->err_handler
)
168 if (!driver
->err_handler
->error_detected
)
170 driver
->err_handler
->error_detected(dev
, pci_channel_io_perm_failure
);
173 /* ------------------------------------------------------- */
175 * handle_eeh_events -- reset a PCI device after hard lockup.
177 * pSeries systems will isolate a PCI slot if the PCI-Host
178 * bridge detects address or data parity errors, DMA's
179 * occurring to wild addresses (which usually happen due to
180 * bugs in device drivers or in PCI adapter firmware).
181 * Slot isolations also occur if #SERR, #PERR or other misc
182 * PCI-related errors are detected.
184 * Recovery process consists of unplugging the device driver
185 * (which generated hotplug events to userspace), then issuing
186 * a PCI #RST to the device, then reconfiguring the PCI config
187 * space for all bridges & devices under this slot, and then
188 * finally restarting the device drivers (which cause a second
189 * set of hotplug events to go out to userspace).
193 * eeh_reset_device() -- perform actual reset of a pci slot
194 * Args: bus: pointer to the pci bus structure corresponding
195 * to the isolated slot. A non-null value will
196 * cause all devices under the bus to be removed
198 * pe_dn: pointer to a "Partitionable Endpoint" device node.
199 * This is the top-level structure on which pci
200 * bus resets can be performed.
/**
 * eeh_reset_device - reset a slot and reconfigure what sits below it.
 * @pe_dn: pci_dn of the endpoint to reset; passed to
 *         rtas_set_slot_reset().
 * @bus:   bus handed to pcibios_remove_pci_devices() before the reset
 *         and to pcibios_add_pci_devices() after it.
 *
 * Visible steps, in order:
 *  - pcibios_remove_pci_devices(bus).
 *  - rc = rtas_set_slot_reset(pe_dn).
 *  - If pe_dn->eeh_pe_config_addr is non-zero, start from
 *    pe_dn->node->parent->child and, for a node whose
 *    eeh_pe_config_addr equals pe_dn's, call rtas_configure_bridge()
 *    and eeh_restore_bars() on it.
 *  - rtas_configure_bridge(pe_dn) and eeh_restore_bars(pe_dn).
 *  - pcibios_add_pci_devices(bus).
 *
 * NOTE(review): this listing is missing lines. The declaration of rc,
 * the conditions guarding the remove/add calls, the check of rc, the
 * sibling loop, the else-branch structure, the delay the comment
 * describes and the return statement are not visible; confirm against
 * the full file.
 */
203 static int eeh_reset_device (struct pci_dn
*pe_dn
, struct pci_bus
*bus
)
207 pcibios_remove_pci_devices(bus
);
209 /* Reset the pci controller. (Asserts RST#; resets config space).
210 * Reconfigure bridges and devices. Don't try to bring the system
211 * up if the reset failed for some reason. */
212 rc
= rtas_set_slot_reset(pe_dn
);
216 /* New-style config addrs might be shared across multiple devices,
217 * Walk over all functions on this device */
218 if (pe_dn
->eeh_pe_config_addr
) {
219 struct device_node
*pe
= pe_dn
->node
;
220 pe
= pe
->parent
->child
;
222 struct pci_dn
*ppe
= PCI_DN(pe
);
223 if (pe_dn
->eeh_pe_config_addr
== ppe
->eeh_pe_config_addr
) {
224 rtas_configure_bridge(ppe
);
225 eeh_restore_bars(ppe
);
230 rtas_configure_bridge(pe_dn
);
231 eeh_restore_bars(pe_dn
);
234 /* Give the system 5 seconds to finish running the user-space
235 * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
236 * this is a hack, but if we don't do this, and try to bring
237 * the device up before the scripts have taken it down,
238 * potentially weird things happen.
242 pcibios_add_pci_devices(bus
);
248 /* The longest amount of time to wait for a pci device
249 * to come back on line, in seconds.
/* handle_eeh_events() compares event->time_unavail against this value
 * multiplied by 1000, so time_unavail is presumably in milliseconds --
 * TODO confirm. */
251 #define MAX_WAIT_FOR_RECOVERY 15
/**
 * handle_eeh_events - drive recovery for the slot named by an EEH event.
 * @event: the EEH event; the visible code reads event->dn, event->dev,
 *         event->state and event->time_unavail.
 *
 * Visible sequence:
 *  - frozen_dn = find_device_pe(event->dn); frozen_bus is looked up
 *    with pcibios_find_pci_bus(frozen_dn) and, as a second attempt,
 *    with pcibios_find_pci_bus(frozen_dn->parent). Error messages are
 *    printed for a missing partition endpoint and a missing bus.
 *  - frozen_pdn->eeh_freeze_count is incremented and compared with
 *    EEH_MAX_ALLOWED_FREEZES.
 *  - A pci_channel_io_perm_failure state whose time_unavail is <= 0 or
 *    greater than MAX_WAIT_FOR_RECOVERY*1000 is tested for.
 *  - eeh_slot_error_detail(frozen_pdn, 1) logs the temporary error and
 *    a message reports the failure count.
 *  - pci_walk_bus() runs eeh_report_error with &result, then by result:
 *      NONE         -> eeh_reset_device(frozen_pdn, frozen_bus)
 *      NEED_RESET   -> eeh_reset_device(frozen_pdn, NULL), then walk
 *                      with eeh_report_reset
 *      CAN_RECOVER  -> same as NEED_RESET (see the XXX comment)
 *    followed by a walk with eeh_report_resume.
 *  - Permanent-failure tail: message, eeh_slot_error_detail(frozen_pdn, 2),
 *    walk with eeh_report_failure, pcibios_remove_pci_devices(frozen_bus).
 *
 * NOTE(review): this listing is missing lines (braces, returns/jumps,
 * the declaration of rc, and whatever is done with the result of
 * eeh_reset_device()), so which tests lead to the permanent-failure
 * tail cannot be seen here; confirm against the full file.
 * NOTE(review): frozen_device, used in the empty-slot test, is not
 * declared in the visible lines; confirm whether that test is actually
 * compiled.
 */
253 void handle_eeh_events (struct eeh_event
*event
)
255 struct device_node
*frozen_dn
;
256 struct pci_dn
*frozen_pdn
;
257 struct pci_bus
*frozen_bus
;
259 enum pci_ers_result result
= PCI_ERS_RESULT_NONE
;
261 frozen_dn
= find_device_pe(event
->dn
);
262 frozen_bus
= pcibios_find_pci_bus(frozen_dn
);
265 printk(KERN_ERR
"EEH: Error: Cannot find partition endpoint for %s\n",
266 pci_name(event
->dev
));
270 /* There are two different styles for coming up with the PE.
271 * In the old style, it was the highest EEH-capable device
272 * which was always an EADS pci bridge. In the new style,
273 * there might not be any EADS bridges, and even when there are,
274 * the firmware marks them as "EEH incapable". So another
275 * two-step is needed to find the pci bus.. */
277 frozen_bus
= pcibios_find_pci_bus (frozen_dn
->parent
);
280 printk(KERN_ERR
"EEH: Cannot find PCI bus for %s\n",
281 frozen_dn
->full_name
);
286 /* We may get "permanent failure" messages on empty slots.
287 * These are false alarms. Empty slots have no child dn. */
288 if ((event
->state
== pci_channel_io_perm_failure
) && (frozen_device
== NULL
))
292 frozen_pdn
= PCI_DN(frozen_dn
);
293 frozen_pdn
->eeh_freeze_count
++;
295 if (frozen_pdn
->eeh_freeze_count
> EEH_MAX_ALLOWED_FREEZES
)
298 /* If the reset state is a '5' and the time to reset is 0 (infinity)
299 * or is more than 15 seconds, then mark this as a permanent failure.
301 if ((event
->state
== pci_channel_io_perm_failure
) &&
302 ((event
->time_unavail
<= 0) ||
303 (event
->time_unavail
> MAX_WAIT_FOR_RECOVERY
*1000)))
306 eeh_slot_error_detail(frozen_pdn
, 1 /* Temporary Error */);
308 "EEH: This PCI device has failed %d times since last reboot: %s - %s\n",
309 frozen_pdn
->eeh_freeze_count
,
310 pci_name (frozen_pdn
->pcidev
),
311 pcid_name(frozen_pdn
->pcidev
));
313 /* Walk the various device drivers attached to this slot through
314 * a reset sequence, giving each an opportunity to do what it needs
315 * to accomplish the reset. Each child gets a report of the
316 * status ... if any child can't handle the reset, then the entire
317 * slot is dlpar removed and added.
319 pci_walk_bus(frozen_bus
, eeh_report_error
, &result
);
321 /* If all device drivers were EEH-unaware, then shut
322 * down all of the device drivers, and hope they
323 * go down willingly, without panicking the system.
325 if (result
== PCI_ERS_RESULT_NONE
) {
326 rc
= eeh_reset_device(frozen_pdn
, frozen_bus
);
331 /* If any device called out for a reset, then reset the slot */
332 if (result
== PCI_ERS_RESULT_NEED_RESET
) {
333 rc
= eeh_reset_device(frozen_pdn
, NULL
);
336 pci_walk_bus(frozen_bus
, eeh_report_reset
, 0);
339 /* If all devices reported they can proceed, then re-enable PIO */
340 if (result
== PCI_ERS_RESULT_CAN_RECOVER
) {
341 /* XXX Not supported; we brute-force reset the device */
342 rc
= eeh_reset_device(frozen_pdn
, NULL
);
345 pci_walk_bus(frozen_bus
, eeh_report_reset
, 0);
348 /* Tell all device drivers that they can resume operations */
349 pci_walk_bus(frozen_bus
, eeh_report_resume
, 0);
355 * About 90% of all real-life EEH failures in the field
356 * are due to poorly seated PCI cards. Only 10% or so are
357 * due to actual, failed cards.
360 "EEH: PCI device %s - %s has failed %d times \n"
361 "and has been permanently disabled. Please try reseating\n"
362 "this device or replacing it.\n",
363 pci_name (frozen_pdn
->pcidev
),
364 pcid_name(frozen_pdn
->pcidev
),
365 frozen_pdn
->eeh_freeze_count
);
367 eeh_slot_error_detail(frozen_pdn
, 2 /* Permanent Error */);
369 /* Notify all devices that they're about to go down. */
370 pci_walk_bus(frozen_bus
, eeh_report_failure
, 0);
372 /* Shut down the device drivers for good. */
373 pcibios_remove_pci_devices(frozen_bus
);
376 /* ---------- end of file ---------- */