// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * This file implements PEs (Partitionable Endpoints) based on the
 * information from platforms. Basically, there are 3 types of PEs:
 * PHB/Bus/Device. All PEs are organized as a hierarchy tree, whose
 * first level is associated with the existing PHBs, since a particular
 * PE is only meaningful within one PHB domain.
 *
 * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
 */

#include <linux/delay.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/string.h>

#include <asm/pci-bridge.h>
#include <asm/ppc-pci.h>
static int eeh_pe_aux_size = 0;
static LIST_HEAD(eeh_phb_pe);
/**
 * eeh_set_pe_aux_size - Set PE auxiliary data size
 * @size: PE auxiliary data size
 *
 * Set the size of the per-PE auxiliary data area.
 */
void eeh_set_pe_aux_size(int size)
{
	if (size < 0)
		return;

	eeh_pe_aux_size = size;
}
/**
 * eeh_pe_alloc - Allocate PE
 * @phb: PCI controller
 * @type: PE type
 *
 * Allocate PE instance dynamically.
 */
static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type)
{
	struct eeh_pe *pe;
	size_t alloc_size;

	alloc_size = sizeof(struct eeh_pe);
	if (eeh_pe_aux_size) {
		alloc_size = ALIGN(alloc_size, cache_line_size());
		alloc_size += eeh_pe_aux_size;
	}

	/* Allocate PHB PE */
	pe = kzalloc(alloc_size, GFP_KERNEL);
	if (!pe)
		return NULL;

	/* Initialize PHB PE */
	pe->type = type;
	pe->phb = phb;
	INIT_LIST_HEAD(&pe->child_list);
	INIT_LIST_HEAD(&pe->edevs);

	pe->data = (void *)pe + ALIGN(sizeof(struct eeh_pe),
				      cache_line_size());
	return pe;
}
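/*
 * Note: when a platform has registered auxiliary data through
 * eeh_set_pe_aux_size(), eeh_pe_alloc() reserves that many extra bytes
 * behind the eeh_pe structure, starting at the next cache line
 * boundary, and pe->data points at that area.
 */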
/**
 * eeh_phb_pe_create - Create PHB PE
 * @phb: PCI controller
 *
 * The function should be called while the PHB is detected during
 * system boot or PCI hotplug in order to create PHB PE.
 */
int eeh_phb_pe_create(struct pci_controller *phb)
{
	struct eeh_pe *pe;

	/* Allocate PHB PE */
	pe = eeh_pe_alloc(phb, EEH_PE_PHB);
	if (!pe) {
		pr_err("%s: out of memory!\n", __func__);
		return -ENOMEM;
	}

	/* Put it into the list */
	list_add_tail(&pe->child, &eeh_phb_pe);

	pr_debug("EEH: Add PE for PHB#%x\n", phb->global_number);

	return 0;
}
/**
 * eeh_wait_state - Wait for PE state
 * @pe: EEH PE
 * @max_wait: maximal period in milliseconds
 *
 * Wait for the state of the associated PE. It might take some time
 * to retrieve the PE's state.
 */
int eeh_wait_state(struct eeh_pe *pe, int max_wait)
{
	int ret;
	int mwait;

	/*
	 * According to PAPR, the state of a PE might be temporarily
	 * unavailable. In that case we have to wait for the time
	 * indicated by the firmware. The maximal wait time is 5
	 * minutes, which was taken from the original EEH
	 * implementation. That implementation also defined the
	 * minimal wait time as 1 second.
	 */
#define EEH_STATE_MIN_WAIT_TIME	(1000)
#define EEH_STATE_MAX_WAIT_TIME	(300 * 1000)

	while (1) {
		ret = eeh_ops->get_state(pe, &mwait);
		if (ret != EEH_STATE_UNAVAILABLE)
			return ret;

		if (max_wait <= 0) {
			pr_warn("%s: Timeout when getting PE's state (%d)\n",
				__func__, max_wait);
			return EEH_STATE_NOT_SUPPORT;
		}

		if (mwait < EEH_STATE_MIN_WAIT_TIME) {
			pr_warn("%s: Firmware returned bad wait value %d\n",
				__func__, mwait);
			mwait = EEH_STATE_MIN_WAIT_TIME;
		} else if (mwait > EEH_STATE_MAX_WAIT_TIME) {
			pr_warn("%s: Firmware returned too long wait value %d\n",
				__func__, mwait);
			mwait = EEH_STATE_MAX_WAIT_TIME;
		}

		msleep(min(mwait, max_wait));
		max_wait -= mwait;
	}
}
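/*
 * In short: firmware-suggested waits are clamped to the
 * [EEH_STATE_MIN_WAIT_TIME, EEH_STATE_MAX_WAIT_TIME] window, the
 * budget in @max_wait is consumed in those increments, and
 * EEH_STATE_NOT_SUPPORT is returned once the budget runs out while
 * the state is still EEH_STATE_UNAVAILABLE.
 */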
/**
 * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB
 * @phb: PCI controller
 *
 * The overall PEs form a hierarchy tree. The first layer of the
 * tree is composed of PHB PEs. The function is used to retrieve
 * the PHB PE that corresponds to the given PHB.
 */
struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
{
	struct eeh_pe *pe;

	list_for_each_entry(pe, &eeh_phb_pe, child) {
		/*
		 * Actually, we needn't check the type since
		 * the PE for the PHB was determined when it
		 * was created.
		 */
		if ((pe->type & EEH_PE_PHB) && pe->phb == phb)
			return pe;
	}

	return NULL;
}
/**
 * eeh_pe_next - Retrieve the next PE in the tree
 * @pe: current PE
 * @root: root PE
 *
 * The function is used to retrieve the next PE in the
 * hierarchy tree.
 */
struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root)
{
	struct list_head *next = pe->child_list.next;

	if (next == &pe->child_list) {
		while (1) {
			if (pe == root)
				return NULL;
			next = pe->child.next;
			if (next != &pe->parent->child_list)
				break;
			pe = pe->parent;
		}
	}

	return list_entry(next, struct eeh_pe, child);
}
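/*
 * eeh_pe_next() implements a pre-order walk of the PE tree: children
 * are visited before siblings, and the walk terminates once it climbs
 * back up to @root. It is the iterator step behind the
 * eeh_for_each_pe() loops used throughout this file.
 */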
/**
 * eeh_pe_traverse - Traverse PEs in the specified PHB
 * @root: root PE
 * @fn: callback
 * @flag: extra parameter to callback
 *
 * The function is used to traverse the specified PE and its
 * child PEs. The traversal is terminated once the callback
 * returns something other than NULL, or there are no more PEs
 * to visit.
 */
void *eeh_pe_traverse(struct eeh_pe *root,
		      eeh_pe_traverse_func fn, void *flag)
{
	struct eeh_pe *pe;
	void *ret;

	eeh_for_each_pe(root, pe) {
		ret = fn(pe, flag);
		if (ret)
			return ret;
	}

	return NULL;
}
/**
 * eeh_pe_dev_traverse - Traverse the devices from the PE
 * @root: EEH PE
 * @fn: function callback
 * @flag: extra parameter to callback
 *
 * The function is used to traverse the devices of the specified
 * PE and its child PEs.
 */
void eeh_pe_dev_traverse(struct eeh_pe *root,
			 eeh_edev_traverse_func fn, void *flag)
{
	struct eeh_pe *pe;
	struct eeh_dev *edev, *tmp;

	if (!root) {
		pr_warn("%s: Invalid PE %p\n",
			__func__, root);
		return;
	}

	/* Traverse root PE */
	eeh_for_each_pe(root, pe)
		eeh_pe_for_each_dev(pe, edev, tmp)
			fn(edev, flag);
}
/**
 * __eeh_pe_get - Check the PE address
 * @pe: EEH PE
 * @flag: PE address to match
 *
 * For one particular PE, it can be identified by PE address
 * or traditional BDF address. A BDF address is composed of
 * Bus/Device/Function number. The extra data referred to by flag
 * indicates which type of address should be used.
 */
static void *__eeh_pe_get(struct eeh_pe *pe, void *flag)
{
	int *target_pe = flag;

	/* PHB PEs are special and should be ignored */
	if (pe->type & EEH_PE_PHB)
		return NULL;

	if (*target_pe == pe->addr)
		return pe;

	return NULL;
}
/**
 * eeh_pe_get - Search PE based on the given address
 * @phb: PCI controller
 * @pe_no: PE number
 *
 * Search for the corresponding PE based on the specified address,
 * which is included in the eeh device. The function is used to check
 * whether the associated PE has already been created for the PE
 * address. Note that the PE address has 2 formats: the traditional PE
 * address, composed of PCI bus/device/function numbers, or the
 * unified PE address.
 */
struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no)
{
	struct eeh_pe *root = eeh_phb_pe_get(phb);

	return eeh_pe_traverse(root, __eeh_pe_get, &pe_no);
}
/**
 * eeh_pe_tree_insert - Add EEH device to parent PE
 * @edev: EEH device
 * @new_pe_parent: PE to create additional PEs under
 *
 * Add EEH device to the PE in edev->pe_config_addr. If a PE already
 * exists with that address then @edev is added to that PE. Otherwise
 * a new PE is created and inserted into the PE tree as a child of
 * @new_pe_parent.
 *
 * If @new_pe_parent is NULL then the new PE will be inserted directly
 * under the PHB.
 */
int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent)
{
	struct pci_controller *hose = edev->controller;
	struct eeh_pe *pe, *parent;

	/*
	 * Search whether the PE already exists according to the PE
	 * address. If it exists, the PE should be composed of the PCI
	 * bus and its subordinate components.
	 */
	pe = eeh_pe_get(hose, edev->pe_config_addr);
	if (pe) {
		if (pe->type & EEH_PE_INVALID) {
			list_add_tail(&edev->entry, &pe->edevs);
			edev->pe = pe;
			/*
			 * We're running to here because of PCI hotplug
			 * caused by EEH recovery. We need to clear
			 * EEH_PE_INVALID up to the top.
			 */
			parent = pe;
			while (parent) {
				if (!(parent->type & EEH_PE_INVALID))
					break;
				parent->type &= ~EEH_PE_INVALID;
				parent = parent->parent;
			}

			eeh_edev_dbg(edev, "Added to existing PE (parent: PE#%x)\n",
				     pe->parent->addr);
		} else {
			/* Mark the PE as type of PCI bus */
			pe->type = EEH_PE_BUS;
			edev->pe = pe;

			/* Put the edev to PE */
			list_add_tail(&edev->entry, &pe->edevs);
			eeh_edev_dbg(edev, "Added to bus PE\n");
		}
		return 0;
	}

	/* Create a new EEH PE */
	if (edev->physfn)
		pe = eeh_pe_alloc(hose, EEH_PE_VF);
	else
		pe = eeh_pe_alloc(hose, EEH_PE_DEVICE);
	if (!pe) {
		pr_err("%s: out of memory!\n", __func__);
		return -ENOMEM;
	}

	pe->addr = edev->pe_config_addr;

	/*
	 * Put the new EEH PE into the hierarchy tree. If the parent
	 * can't be found, the newly created PE will be attached to the
	 * PHB directly. Otherwise, we have to associate the PE with
	 * its parent.
	 */
	if (!new_pe_parent) {
		new_pe_parent = eeh_phb_pe_get(hose);
		if (!new_pe_parent) {
			pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
			       __func__, hose->global_number);
			edev->pe = NULL;
			kfree(pe);
			return -EEXIST;
		}
	}

	/* link new PE into the tree */
	pe->parent = new_pe_parent;
	list_add_tail(&pe->child, &new_pe_parent->child_list);

	/*
	 * Put the newly created PE into the child list and
	 * link the EEH device accordingly.
	 */
	list_add_tail(&edev->entry, &pe->edevs);
	edev->pe = pe;
	eeh_edev_dbg(edev, "Added to new (parent: PE#%x)\n",
		     new_pe_parent->addr);

	return 0;
}
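/*
 * Three outcomes are possible above: the device joins an existing PE
 * that was only marked EEH_PE_INVALID (and the invalid flags are
 * cleared up the tree), it joins an existing bus PE, or a brand new
 * device/VF PE is allocated and linked under @new_pe_parent (or the
 * PHB PE when no parent was given).
 */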
/**
 * eeh_pe_tree_remove - Remove one EEH device from the associated PE
 * @edev: EEH device
 *
 * The PE hierarchy tree might be changed when doing PCI hotplug.
 * Also, the PCI devices or buses could be removed from the system
 * during EEH recovery. So we have to call the function to remove the
 * corresponding PE accordingly if necessary.
 */
int eeh_pe_tree_remove(struct eeh_dev *edev)
{
	struct eeh_pe *pe, *parent, *child;
	bool keep, recover;
	int cnt;

	pe = eeh_dev_to_pe(edev);
	if (!pe) {
		eeh_edev_dbg(edev, "No PE found for device.\n");
		return -EEXIST;
	}

	/* Remove the EEH device */
	edev->pe = NULL;
	list_del(&edev->entry);

	/*
	 * Check if the parent PE includes any EEH devices.
	 * If not, we should delete that. Also, we should
	 * delete the parent PE if it doesn't have associated
	 * child PEs and EEH devices.
	 */
	while (1) {
		parent = pe->parent;

		/* PHB PEs should never be removed */
		if (pe->type & EEH_PE_PHB)
			break;

		/*
		 * XXX: KEEP is set while resetting a PE. I don't think it's
		 *      ever set without RECOVERING also being set. I could
		 *      be wrong though so catch that with a WARN.
		 */
		keep = !!(pe->state & EEH_PE_KEEP);
		recover = !!(pe->state & EEH_PE_RECOVERING);
		WARN_ON(keep && !recover);

		if (!keep && !recover) {
			if (list_empty(&pe->edevs) &&
			    list_empty(&pe->child_list)) {
				list_del(&pe->child);
				kfree(pe);
			} else {
				break;
			}
		} else {
			/*
			 * Mark the PE as invalid. At the end of the recovery
			 * process any invalid PEs will be garbage collected.
			 *
			 * We need to delay the free()ing of them since we can
			 * remove edevs while traversing the PE tree, which
			 * might trigger the removal of a PE and we can't
			 * deal with that (yet).
			 */
			if (list_empty(&pe->edevs)) {
				cnt = 0;
				list_for_each_entry(child, &pe->child_list, child) {
					if (!(child->type & EEH_PE_INVALID)) {
						cnt++;
						break;
					}
				}

				if (!cnt)
					pe->type |= EEH_PE_INVALID;
				else
					break;
			}
		}

		pe = parent;
	}

	return 0;
}
/**
 * eeh_pe_update_time_stamp - Update PE's frozen time stamp
 * @pe: EEH PE
 *
 * We keep a time stamp for each PE to trace when it was last
 * frozen. The function should be called to update the time stamp
 * on the first error of the specific PE. On the other hand, errors
 * that happened more than an hour ago needn't be accounted for.
 */
void eeh_pe_update_time_stamp(struct eeh_pe *pe)
{
	time64_t tstamp;

	if (!pe)
		return;

	if (pe->freeze_count <= 0) {
		pe->freeze_count = 0;
		pe->tstamp = ktime_get_seconds();
	} else {
		tstamp = ktime_get_seconds();
		if (tstamp - pe->tstamp > 3600) {
			pe->tstamp = tstamp;
			pe->freeze_count = 0;
		}
	}
}
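/*
 * The one-hour window is anchored at pe->tstamp: freezes within 3600
 * seconds of it keep the existing freeze_count, while a later freeze
 * re-arms the window and clears the counter.
 */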
/**
 * eeh_pe_state_mark - Mark specified state for PE and its associated device
 * @root: EEH PE
 * @state: state bits to set
 *
 * An EEH error affects the current PE and its child PEs. The function
 * is used to mark the appropriate state for the affected PEs and the
 * associated devices.
 */
void eeh_pe_state_mark(struct eeh_pe *root, int state)
{
	struct eeh_pe *pe;

	eeh_for_each_pe(root, pe)
		if (!(pe->state & EEH_PE_REMOVED))
			pe->state |= state;
}
EXPORT_SYMBOL_GPL(eeh_pe_state_mark);
/**
 * eeh_pe_mark_isolated - Mark a PE hierarchy as isolated
 * @root: EEH PE
 *
 * Record that a PE has been isolated by marking the PE and its children as
 * EEH_PE_ISOLATED (and EEH_PE_CFG_BLOCKED, if required) and their PCI devices
 * as pci_channel_io_frozen.
 */
void eeh_pe_mark_isolated(struct eeh_pe *root)
{
	struct eeh_pe *pe;
	struct eeh_dev *edev;
	struct pci_dev *pdev;

	eeh_pe_state_mark(root, EEH_PE_ISOLATED);
	eeh_for_each_pe(root, pe) {
		list_for_each_entry(edev, &pe->edevs, entry) {
			pdev = eeh_dev_to_pci_dev(edev);
			if (pdev)
				pdev->error_state = pci_channel_io_frozen;
		}

		/* Block PCI config access if required */
		if (pe->state & EEH_PE_CFG_RESTRICTED)
			pe->state |= EEH_PE_CFG_BLOCKED;
	}
}
EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated);
static void __eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag)
{
	int mode = *((int *)flag);

	edev->mode |= mode;
}
/**
 * eeh_pe_dev_mode_mark - Mark the mode for all devices under the PE
 * @pe: EEH PE
 * @mode: EEH mode bits to set
 *
 * Mark the specific mode for all child devices of the PE.
 */
void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode)
{
	eeh_pe_dev_traverse(pe, __eeh_pe_dev_mode_mark, &mode);
}
/**
 * eeh_pe_state_clear - Clear state for the PE
 * @root: EEH PE
 * @state: state bits to clear
 * @include_passed: include passed-through devices?
 *
 * The function is used to clear the indicated state from the
 * given PE. Besides, we also clear the check count of the PE
 * as well.
 */
void eeh_pe_state_clear(struct eeh_pe *root, int state, bool include_passed)
{
	struct eeh_pe *pe;
	struct eeh_dev *edev, *tmp;
	struct pci_dev *pdev;

	eeh_for_each_pe(root, pe) {
		/* Keep the state of permanently removed PE intact */
		if (pe->state & EEH_PE_REMOVED)
			continue;

		if (!include_passed && eeh_pe_passed(pe))
			continue;

		pe->state &= ~state;

		/*
		 * Special treatment on clearing isolated state. Clear
		 * the check count since the last isolation and put all
		 * affected devices back to the normal state.
		 */
		if (!(state & EEH_PE_ISOLATED))
			continue;

		pe->check_count = 0;
		eeh_pe_for_each_dev(pe, edev, tmp) {
			pdev = eeh_dev_to_pci_dev(edev);
			if (!pdev)
				continue;

			pdev->error_state = pci_channel_io_normal;
		}

		/* Unblock PCI config access if required */
		if (pe->state & EEH_PE_CFG_RESTRICTED)
			pe->state &= ~EEH_PE_CFG_BLOCKED;
	}
}
/*
 * Some PCI bridges (e.g. PLX bridges) have primary/secondary
 * buses assigned explicitly by firmware, and we probably have
 * lost that after reset. So we have to delay the check until
 * the PCI-CFG registers have been restored for the parent
 * bridge.
 *
 * Don't use the normal PCI-CFG accessors, which are probably
 * blocked on the normal path at this stage. Instead, use the
 * eeh operations, which are always permitted.
 */
static void eeh_bridge_check_link(struct eeh_dev *edev)
{
	int cap;
	u32 val;
	int timeout = 0;

	/*
	 * We only check root ports and downstream ports of
	 * PCIe switches
	 */
	if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
		return;

	eeh_edev_dbg(edev, "Checking PCIe link...\n");

	/* Check slot status */
	cap = edev->pcie_cap;
	eeh_ops->read_config(edev, cap + PCI_EXP_SLTSTA, 2, &val);
	if (!(val & PCI_EXP_SLTSTA_PDS)) {
		eeh_edev_dbg(edev, "No card in the slot (0x%04x) !\n", val);
		return;
	}

	/* Check power status if we have the capability */
	eeh_ops->read_config(edev, cap + PCI_EXP_SLTCAP, 2, &val);
	if (val & PCI_EXP_SLTCAP_PCP) {
		eeh_ops->read_config(edev, cap + PCI_EXP_SLTCTL, 2, &val);
		if (val & PCI_EXP_SLTCTL_PCC) {
			eeh_edev_dbg(edev, "In power-off state, power it on ...\n");
			val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
			val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
			eeh_ops->write_config(edev, cap + PCI_EXP_SLTCTL, 2, val);
		}
	}

	/* Enable link */
	eeh_ops->read_config(edev, cap + PCI_EXP_LNKCTL, 2, &val);
	val &= ~PCI_EXP_LNKCTL_LD;
	eeh_ops->write_config(edev, cap + PCI_EXP_LNKCTL, 2, val);

	/* Check link reporting capability */
	eeh_ops->read_config(edev, cap + PCI_EXP_LNKCAP, 4, &val);
	if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
		eeh_edev_dbg(edev, "No link reporting capability (0x%08x) \n", val);
		return;
	}

	/* Wait until the link is up or the timeout (5s) expires */
	while (timeout < 5000) {
		msleep(20);
		timeout += 20;

		eeh_ops->read_config(edev, cap + PCI_EXP_LNKSTA, 2, &val);
		if (val & PCI_EXP_LNKSTA_DLLLA)
			break;
	}

	if (val & PCI_EXP_LNKSTA_DLLLA)
		eeh_edev_dbg(edev, "Link up (%s)\n",
			     (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
	else
		eeh_edev_dbg(edev, "Link not ready (0x%04x)\n", val);
}
#define BYTE_SWAP(OFF)	(8*((OFF)/4)+3-(OFF))
#define SAVED_BYTE(OFF)	(((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
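/*
 * edev->config_space[] caches the config space as 32-bit words, one
 * word per 4-byte offset. BYTE_SWAP() maps a config-space byte offset
 * into that array while reversing the byte order within each word, and
 * SAVED_BYTE() returns the saved value of a single config byte, e.g.
 * SAVED_BYTE(PCI_CACHE_LINE_SIZE) below.
 */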
static void eeh_restore_bridge_bars(struct eeh_dev *edev)
{
	int i;

	/*
	 * Device BARs: 0x10 - 0x18
	 * Bus numbers and windows: 0x18 - 0x30
	 */
	for (i = 4; i < 13; i++)
		eeh_ops->write_config(edev, i*4, 4, edev->config_space[i]);
	/* Rom: 0x38 */
	eeh_ops->write_config(edev, 14*4, 4, edev->config_space[14]);

	/* Cache line & Latency timer: 0xC 0xD */
	eeh_ops->write_config(edev, PCI_CACHE_LINE_SIZE, 1,
			      SAVED_BYTE(PCI_CACHE_LINE_SIZE));
	eeh_ops->write_config(edev, PCI_LATENCY_TIMER, 1,
			      SAVED_BYTE(PCI_LATENCY_TIMER));
	/* Max latency, min grant, interrupt pin and line: 0x3C */
	eeh_ops->write_config(edev, 15*4, 4, edev->config_space[15]);

	/* PCI Command: 0x4 */
	eeh_ops->write_config(edev, PCI_COMMAND, 4, edev->config_space[1] |
			      PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);

	/* Check the PCIe link is ready */
	eeh_bridge_check_link(edev);
}
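/*
 * Note that the bridge's command register is restored with
 * PCI_COMMAND_MEMORY and PCI_COMMAND_MASTER forced on, so memory
 * decoding and bus mastering behind the bridge are re-enabled even if
 * the saved command word had them clear.
 */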
static void eeh_restore_device_bars(struct eeh_dev *edev)
{
	int i;
	u32 cmd;

	for (i = 4; i < 10; i++)
		eeh_ops->write_config(edev, i*4, 4, edev->config_space[i]);
	/* 12 == Expansion ROM Address */
	eeh_ops->write_config(edev, 12*4, 4, edev->config_space[12]);

	eeh_ops->write_config(edev, PCI_CACHE_LINE_SIZE, 1,
			      SAVED_BYTE(PCI_CACHE_LINE_SIZE));
	eeh_ops->write_config(edev, PCI_LATENCY_TIMER, 1,
			      SAVED_BYTE(PCI_LATENCY_TIMER));

	/* max latency, min grant, interrupt pin and line */
	eeh_ops->write_config(edev, 15*4, 4, edev->config_space[15]);

	/*
	 * Restore PERR & SERR bits, some devices require it,
	 * don't touch the other command bits
	 */
	eeh_ops->read_config(edev, PCI_COMMAND, 4, &cmd);
	if (edev->config_space[1] & PCI_COMMAND_PARITY)
		cmd |= PCI_COMMAND_PARITY;
	else
		cmd &= ~PCI_COMMAND_PARITY;
	if (edev->config_space[1] & PCI_COMMAND_SERR)
		cmd |= PCI_COMMAND_SERR;
	else
		cmd &= ~PCI_COMMAND_SERR;
	eeh_ops->write_config(edev, PCI_COMMAND, 4, cmd);
}
/**
 * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
 * @edev: EEH device
 * @flag: unused
 *
 * Loads the PCI configuration space base address registers,
 * the expansion ROM base address, the latency timer, and so on
 * from the saved values in the device node.
 */
static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
{
	/* Do special restore for bridges */
	if (edev->mode & EEH_DEV_BRIDGE)
		eeh_restore_bridge_bars(edev);
	else
		eeh_restore_device_bars(edev);

	if (eeh_ops->restore_config)
		eeh_ops->restore_config(edev);
}
/**
 * eeh_pe_restore_bars - Restore the PCI config space info
 * @pe: EEH PE
 *
 * This routine performs a recursive walk to the children
 * of this device as well.
 */
void eeh_pe_restore_bars(struct eeh_pe *pe)
{
	/*
	 * We needn't take the EEH lock since eeh_pe_dev_traverse()
	 * will take that.
	 */
	eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL);
}
/**
 * eeh_pe_loc_get - Retrieve the location code bound to the given PE
 * @pe: EEH PE
 *
 * Retrieve the location code of the given PE. If the primary bus of
 * the PE is the root bus, we grab the location code from the PHB's
 * device tree node or root port. Otherwise, the device tree node of
 * the upstream bridge of the primary PE bus is checked for the
 * location code.
 */
const char *eeh_pe_loc_get(struct eeh_pe *pe)
{
	struct pci_bus *bus = eeh_pe_bus_get(pe);
	struct device_node *dn;
	const char *loc = NULL;

	while (bus) {
		dn = pci_bus_to_OF_node(bus);
		if (!dn) {
			bus = bus->parent;
			continue;
		}

		if (pci_is_root_bus(bus))
			loc = of_get_property(dn, "ibm,io-base-loc-code", NULL);
		else
			loc = of_get_property(dn, "ibm,slot-location-code",
					      NULL);
		if (loc)
			return loc;

		bus = bus->parent;
	}

	return "N/A";
}
/**
 * eeh_pe_bus_get - Retrieve PCI bus according to the given PE
 * @pe: EEH PE
 *
 * Retrieve the PCI bus according to the given PE. Basically,
 * there are 3 types of PEs: PHB/Bus/Device. For a PHB PE, the
 * primary PCI bus is retrieved. The parent bus is returned for
 * a BUS PE. However, we don't have an associated PCI bus for a
 * DEVICE PE.
 */
struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
{
	struct eeh_dev *edev;
	struct pci_dev *pdev;

	if (pe->type & EEH_PE_PHB)
		return pe->phb->bus;

	/* The primary bus might be cached during probe time */
	if (pe->state & EEH_PE_PRI_BUS)
		return pe->bus;

	/* Retrieve the parent PCI bus of first (top) PCI device */
	edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry);
	pdev = eeh_dev_to_pci_dev(edev);