summaryrefslogtreecommitdiffstats
path: root/hw/phb4.c
diff options
context:
space:
mode:
authorOliver O'Halloran <oohall@gmail.com>2018-10-30 11:02:30 +1100
committerStewart Smith <stewart@linux.ibm.com>2018-11-01 23:56:44 -0500
commit9597a12ef4b3644e4b8644f659bec04ca139b7f9 (patch)
treebc74bf641e0e0b597df9e646637ffe9012d8fc4d /hw/phb4.c
parentcee7ec9eae090e3e9ca6b8f07f40a9ae8164ecab (diff)
downloadblackbird-skiboot-9597a12ef4b3644e4b8644f659bec04ca139b7f9.tar.gz
blackbird-skiboot-9597a12ef4b3644e4b8644f659bec04ca139b7f9.zip
phb4: Check for RX errors after link training
Some PHB4 PHYs can get stuck in a bad state where they are constantly retraining the link. This happens transparently to skiboot and Linux but causes PCIe to be slow. Resetting the PHB4 clears the problem. We can detect this case by looking at the RX error count at the point where we check for link stability. This patch does this by modifying the link optimal code to check for RX errors. If errors are occurring we retrain the link irrespective of the chip rev or card. Normally when this problem occurs, the RX error count is maxed out at 255. When there is no problem, the count is 0. We chose 8 as the max RX errors value to give us some margin for a few errors. There is also a knob that can be used to set the error threshold for when we should retrain the link, i.e.: nvram -p ibm,skiboot --update-config phb-rx-err-max=8 Signed-off-by: Oliver O'Halloran <oohall@gmail.com> Signed-off-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
Diffstat (limited to 'hw/phb4.c')
-rw-r--r--hw/phb4.c29
1 files changed, 26 insertions, 3 deletions
diff --git a/hw/phb4.c b/hw/phb4.c
index 67983634..5578cb42 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -152,6 +152,7 @@ static bool verbose_eeh;
static bool pci_tracing;
static bool pci_eeh_mmio;
static bool pci_retry_all;
+static int rx_err_max = PHB4_RX_ERR_MAX;
/* Note: The "ASB" name is historical, practically this means access via
* the XSCOM backdoor
@@ -2672,11 +2673,12 @@ static void phb4_lane_eq_change(struct phb4 *p, uint32_t vdid)
static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
{
struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t reg;
uint32_t id;
- uint16_t bdfn;
- uint8_t trained_speed, phb_speed, dev_speed, target_speed;
+ uint16_t bdfn, lane_errs;
+ uint8_t trained_speed, phb_speed, dev_speed, target_speed, rx_errs;
uint8_t trained_width, phb_width, dev_width, target_width;
- bool optimal_speed, optimal_width, optimal, retry_enabled;
+ bool optimal_speed, optimal_width, optimal, retry_enabled, rx_err_ok;
/* Current trained state */
@@ -2702,6 +2704,11 @@ static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
retry_enabled = (phb4_chip_retry_workaround() &&
phb4_adapter_in_whitelist(id)) ||
phb4_lane_eq_retry_whitelist(id);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_COUNTERS);
+ rx_errs = GETFIELD(PHB_PCIE_DLP_RX_ERR_CNT, reg);
+ rx_err_ok = (rx_errs < rx_err_max);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_STATUS);
+ lane_errs = GETFIELD(PHB_PCIE_DLP_LANE_ERR, reg);
PHBDBG(p, "LINK: Card [%04x:%04x] %s Retry:%s\n", VENDOR(id),
DEVICE(id), optimal ? "Optimal" : "Degraded",
@@ -2710,10 +2717,16 @@ static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
trained_speed, phb_speed, dev_speed, optimal_speed ? "" : " *");
PHBDBG(p, "LINK: Width Train:x%02i PHB:x%02i DEV:x%02i%s\n",
trained_width, phb_width, dev_width, optimal_width ? "" : " *");
+ PHBDBG(p, "LINK: RX Errors Now:%i Max:%i Lane:0x%04x%s\n",
+ rx_errs, rx_err_max, lane_errs, rx_err_ok ? "" : " *");
if (vdid)
*vdid = id;
+ /* Always do RX error retry irrespective of chip and card */
+ if (!rx_err_ok)
+ return false;
+
if (!retry_enabled)
return true;
@@ -5778,6 +5791,7 @@ static void phb4_probe_pbcq(struct dt_node *pbcq)
void probe_phb4(void)
{
struct dt_node *np;
+ const char *s;
verbose_eeh = nvram_query_eq("pci-eeh-verbose", "true");
/* REMOVEME: force this for now until we stabalise PCIe */
@@ -5788,6 +5802,15 @@ void probe_phb4(void)
pci_tracing = nvram_query_eq("pci-tracing", "true");
pci_eeh_mmio = !nvram_query_eq("pci-eeh-mmio", "disabled");
pci_retry_all = nvram_query_eq("pci-retry-all", "true");
+ s = nvram_query("phb-rx-err-max");
+ if (s) {
+ rx_err_max = atoi(s);
+
+ /* Clip to uint8_t used by hardware */
+ rx_err_max = MAX(rx_err_max, 0);
+ rx_err_max = MIN(rx_err_max, 255);
+ }
+ prlog(PR_DEBUG, "PHB4: Maximum RX errors during training: %d\n", rx_err_max);
/* Look for PBCQ XSCOM nodes */
dt_for_each_compatible(dt_root, np, "ibm,power9-pbcq")
OpenPOWER on IntegriCloud