From 89d1e09b6a6d844ef327937f41658a426be42501 Mon Sep 17 00:00:00 2001 From: Bryan O'Sullivan Date: Thu, 28 Sep 2006 09:00:18 -0700 Subject: IB/ipath: Fix and recover TXE piobuf and PBC parity errors We can sometimes trigger parity errors due to processor speculative reads to our write-combined memory (mostly seen on Woodcrest). Add a stats counter for these. Factored out the sendbuffererror buffer cancellation code so it can be used in the new handling; suppress likely subsequent error messages if within two jiffies of the cancellation. Also restore 2 dropped TXE lines on hwe_bitsextant noticed while debugging. Signed-off-by: Bryan O'Sullivan Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_iba6120.c | 37 ++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/hw/ipath/ipath_iba6120.c') diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index 08a44dd9ed6f..024b6aa320f1 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c @@ -370,7 +370,10 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, * make sure we get this much out, unless told to be quiet, * or it's occurred within the last 5 seconds */ - if ((hwerrs & ~dd->ipath_lasthwerror) || + if ((hwerrs & ~(dd->ipath_lasthwerror | + ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | + INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) || (ipath_debug & __IPATH_VERBDBG)) dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " "(cleared)\n", (unsigned long long) hwerrs); @@ -383,6 +386,33 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); if (ctrl & INFINIPATH_C_FREEZEMODE) { + /* + * parity errors in send memory are recoverable, + * just cancel the send (if indicated in * sendbuffererror), + * count the occurrence, unfreeze (if no other handled + * hardware error bits are set), and continue. They can + * occur if a processor speculative read is done to the PIO + * buffer while we are sending a packet, for example. + */ + if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | + INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) { + ipath_stats.sps_txeparity++; + ipath_dbg("Recovering from TXE parity error (%llu), " + "hwerrstatus=%llx\n", + (unsigned long long) ipath_stats.sps_txeparity, + (unsigned long long) hwerrs); + ipath_disarm_senderrbufs(dd); + hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | + INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT); + if (!hwerrs) { /* else leave in freeze mode */ + ipath_write_kreg(dd, + dd->ipath_kregs->kr_control, + dd->ipath_control); + return; + } + } if (hwerrs) { /* * if any set that we aren't ignoring only make the @@ -406,9 +436,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, } else { ipath_dbg("Clearing freezemode on ignored hardware " "error\n"); - ctrl &= ~INFINIPATH_C_FREEZEMODE; ipath_write_kreg(dd, dd->ipath_kregs->kr_control, - ctrl); + dd->ipath_control); } } @@ -880,6 +909,8 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd) dd->ipath_hwe_bitsextant = (INFINIPATH_HWE_RXEMEMPARITYERR_MASK << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_TXEMEMPARITYERR_MASK << + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) | (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK << INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) | INFINIPATH_HWE_PCIE1PLLFAILED | -- cgit v1.2.1