summaryrefslogtreecommitdiffstats
path: root/hw
diff options
context:
space:
mode:
authorFrederic Barrat <fbarrat@linux.ibm.com>2019-04-05 16:33:03 +0200
committerStewart Smith <stewart@linux.ibm.com>2019-04-09 10:50:55 +1000
commitd1f3e4faf9d99d76bc413503afea87c8486af8b1 (patch)
tree5ae00584124d92e2193762e100abe289c26c23e6 /hw
parent7320a21e7261d5ed87971a7985fecdd7588a72ec (diff)
downloadtalos-skiboot-d1f3e4faf9d99d76bc413503afea87c8486af8b1.tar.gz
talos-skiboot-d1f3e4faf9d99d76bc413503afea87c8486af8b1.zip
hw/npu2: Dump (more) npu2 registers on link error and HMIs
We were already logging some NPU registers during an HMI. This patch cleans up a bit how it is done and separates what is global from what is specific to nvlink or opencapi. Since we can now receive an error interrupt when an opencapi link goes down unexpectedly, we also dump the NPU state but we limit it to the registers of the brick which hit the error. The list of registers to dump was worked out with the hw team to allow for proper debugging. For each register, we print the name as found in the NPU workbook, the scom address and the register value. Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com> Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com> Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
Diffstat (limited to 'hw')
-rw-r--r--hw/npu2-common.c234
1 file changed, 234 insertions, 0 deletions
diff --git a/hw/npu2-common.c b/hw/npu2-common.c
index ccbbbbca..d4c0f851 100644
--- a/hw/npu2-common.c
+++ b/hw/npu2-common.c
@@ -103,6 +103,239 @@ void npu2_write_mask_4b(struct npu2 *p, uint64_t reg, uint32_t val, uint32_t mas
(uint64_t)new_val << 32);
}
+/*
+ * Describes one NPU register to dump on error: the register name as
+ * found in the NPU workbook, plus the block and offset used to build
+ * its scom address (the stack is supplied by the caller).
+ */
+typedef struct {
+	const char *name;	/* register name from the NPU workbook */
+	uint32_t block;		/* NPU2_BLOCK_* the register belongs to */
+	uint32_t offset;	/* register offset within that block */
+} npu2_scom_dump_t;
+
+/*
+ * Registers dumped for every link type, per stack. The caller decides
+ * which stack(s) to iterate; each entry is resolved to a scom address
+ * with NPU2_REG_OFFSET(stack, block, offset). List worked out with the
+ * hw team for proper debugging.
+ */
+static npu2_scom_dump_t npu2_scom_dump_global[] = {
+	/* CQ State Machine */
+	{ "CS.SM0.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0 },
+	{ "CS.SM1.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0 },
+	{ "CS.SM2.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0 },
+	{ "CS.SM3.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1 },
+	{ "CS.SM1.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1 },
+	{ "CS.SM2.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1 },
+	{ "CS.SM3.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2 },
+	{ "CS.SM1.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2 },
+	{ "CS.SM2.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2 },
+	{ "CS.SM3.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3 },
+	{ "CS.SM1.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3 },
+	{ "CS.SM2.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3 },
+	{ "CS.SM3.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4 },
+	{ "CS.SM1.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4 },
+	{ "CS.SM2.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4 },
+	{ "CS.SM3.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5 },
+	{ "CS.SM1.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5 },
+	{ "CS.SM2.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5 },
+	{ "CS.SM3.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6 },
+	{ "CS.SM1.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6 },
+	{ "CS.SM2.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6 },
+	{ "CS.SM3.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6 },
+
+	{ "CS.SM0.MISC.CERR_FIRST0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0 },
+	{ "CS.SM1.MISC.CERR_FIRST0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0 },
+	{ "CS.SM2.MISC.CERR_FIRST0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0 },
+	{ "CS.SM3.MISC.CERR_FIRST0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0 },
+
+	{ "CS.SM0.MISC.CERR_FIRST1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1 },
+	{ "CS.SM1.MISC.CERR_FIRST1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1 },
+	{ "CS.SM2.MISC.CERR_FIRST1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1 },
+	{ "CS.SM3.MISC.CERR_FIRST1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1 },
+
+	{ "CS.SM0.MISC.CERR_FIRST2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2 },
+	{ "CS.SM1.MISC.CERR_FIRST2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2 },
+	{ "CS.SM2.MISC.CERR_FIRST2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2 },
+	{ "CS.SM3.MISC.CERR_FIRST2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2 },
+
+	/* CQ Control */
+	{ "CS.CTL.MISC.CERR_MESSAGE0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0 },
+	{ "CS.CTL.MISC.CERR_MESSAGE1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1 },
+	{ "CS.CTL.MISC.CERR_FIRST0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0 },
+	{ "CS.CTL.MISC.CERR_FIRST1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1 },
+
+	/* CQ Data */
+	{ "DAT.MISC.CERR_ECC_HOLD", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS },
+	{ "DAT.MISC.CERR_ECC_MASK", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK },
+	{ "DAT.MISC.CERR_ECC_FIRST", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST },
+	{ "DAT.MISC.REM0", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0 },
+	{ "DAT.MISC.REM1", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1 },
+};
+
+/*
+ * nvlink-specific registers, per brick. The block field (NTL0/NTL1)
+ * identifies which of the two bricks of a stack an entry belongs to,
+ * so callers can filter on it.
+ */
+static npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
+	{ "NTL0.REGS.CERR_FIRST1", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF },
+	{ "NTL1.REGS.CERR_FIRST1", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF },
+	{ "NTL0.REGS.CERR_FIRST2", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF },
+	{ "NTL1.REGS.CERR_FIRST2", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF },
+};
+
+/*
+ * opencapi-specific registers, per brick. The block field (OTL0/OTL1)
+ * identifies which of the two bricks of a stack an entry belongs to,
+ * so callers can filter on it.
+ */
+static npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
+	{ "OTL0.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0 },
+	{ "OTL1.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0 },
+	{ "OTL0.MISC.OTL_REM0", NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0 },
+	{ "OTL1.MISC.OTL_REM0", NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0 },
+	{ "OTL0.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG },
+	{ "OTL1.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG },
+	{ "OTL0.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG },
+	{ "OTL1.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG },
+	{ "OTL0.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1 },
+	{ "OTL1.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1 },
+};
+
+/*
+ * Read one NPU register through the indirect scom window and log its
+ * name (from the workbook), scom address and value at error level.
+ */
+static void print_one_npu_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int stack)
+{
+	uint64_t addr = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
+	uint64_t data = npu2_scom_read(npu->chip_id, npu->xscom_base, addr,
+				       NPU2_MISC_DA_LEN_8B);
+
+	/* 'stack - 4' turns the absolute stack id into the STCKn label;
+	 * assumes NPU2_STACK_STCK_0 == 4 -- matches the workbook naming */
+	prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
+	      npu->chip_id, stack - 4, scom->name, addr, data);
+}
+
+/*
+ * Same as above, but for direct access registers.
+ *
+ * Fix vs. original: the return value of xscom_read() was ignored, so a
+ * failed read logged an uninitialized 'val' (undefined behavior). We
+ * now check the result and report the failure instead.
+ */
+static void print_one_reg(int chip_id, int brick_index,
+			  uint64_t reg_addr, const char *reg_name)
+{
+	uint64_t val = 0;
+	int rc;
+
+	rc = xscom_read(chip_id, reg_addr, &val);
+	if (rc) {
+		prlog(PR_ERR, "NPU[%d] %s brick %d 0x%llx: xscom read error %d\n",
+		      chip_id, reg_name, brick_index, reg_addr, rc);
+		return;
+	}
+	prlog(PR_ERR, "NPU[%d] %s brick %d 0x%llx = 0x%016llx\n",
+	      chip_id, reg_name, brick_index, reg_addr, val);
+}
+
+/*
+ * Dump the nvlink-specific registers of one brick. Each stack hosts
+ * two bricks; only the table entries matching this brick's NTL block
+ * are printed.
+ */
+static void show_nvlink_regs(struct npu2 *npu, int brick_index)
+{
+	uint32_t stack = NPU2_STACK_STCK_0 + brick_index / 2;
+	uint32_t ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2;
+	npu2_scom_dump_t *scom;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_nvlink); i++) {
+		scom = &npu2_scom_dump_nvlink[i];
+		if (scom->block != ntl)
+			continue;
+		print_one_npu_reg(npu, scom, stack);
+	}
+}
+
+/*
+ * Dump the opencapi-specific registers of one brick: the per-brick OTL
+ * registers from the table, then the ODL fabric registers accessed
+ * through direct xscom.
+ */
+static void show_opencapi_regs(struct npu2 *npu, int brick_index)
+{
+	uint32_t stack = NPU2_STACK_STCK_0 + brick_index / 2;
+	uint32_t otl = NPU2_BLOCK_OTL0 + (brick_index % 2);
+	npu2_scom_dump_t *scom;
+	int i;
+
+	/* NPU registers */
+	for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_ocapi); i++) {
+		scom = &npu2_scom_dump_ocapi[i];
+		if (scom->block != otl)
+			continue;
+		print_one_npu_reg(npu, scom, stack);
+	}
+
+	/* Fabric registers */
+	print_one_reg(npu->chip_id, brick_index,
+		      OB_ODL_STATUS(brick_index), "ODL status");
+	print_one_reg(npu->chip_id, brick_index,
+		      OB_ODL_TRAINING_STATUS(brick_index), "ODL training status");
+	print_one_reg(npu->chip_id, brick_index,
+		      OB_ODL_ENDPOINT_INFO(brick_index), "ODL endpoint info");
+}
+
+/*
+ * Dump the NPU state: FIRs, global per-stack registers, and the
+ * link-type-specific (nvlink or opencapi) registers.
+ *
+ * brick_index selects a single brick, or -1 to dump all bricks of the
+ * chip (e.g. during an HMI).
+ *
+ * Fix vs. original: the xscom_read() calls in the FIR loop were
+ * unchecked, so a failed read logged uninitialized fir_val/mask_val
+ * (undefined behavior). We now check and skip with an error message.
+ */
+static void show_all_regs(struct npu2 *npu, int brick_index)
+{
+	int i, stack, stack_min, stack_max;
+	uint64_t fir_val, mask_val, fir_addr, mask_addr;
+	struct npu2_dev *dev;
+	npu2_scom_dump_t scom_reg;
+
+	if (brick_index != -1) {
+		/* single brick: only its stack is of interest (2 bricks/stack) */
+		stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
+	} else {
+		stack_min = NPU2_STACK_STCK_0;
+		stack_max = NPU2_STACK_STCK_2;
+		/* Avoid dumping unused stacks for opencapi on Lagrange */
+		if (npu->total_devices == 2)
+			stack_min = stack_max = NPU2_STACK_STCK_1;
+	}
+
+	/* NPU FIRs */
+	for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+		fir_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
+		mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET;
+		if (xscom_read(npu->chip_id, fir_addr, &fir_val) ||
+		    xscom_read(npu->chip_id, mask_addr, &mask_val)) {
+			/* don't log garbage if the scom read failed */
+			prlog(PR_ERR, "NPU[%d] FIR%d: xscom read error\n",
+			      npu->chip_id, i);
+			continue;
+		}
+		prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx => 0x%016llx)\n",
+		      npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
+	}
+
+	/* NPU global, per-stack registers */
+	for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_global); i++) {
+		for (stack = stack_min; stack <= stack_max; stack++)
+			print_one_npu_reg(npu, &npu2_scom_dump_global[i], stack);
+	}
+
+	/*
+	 * NPU global registers, stack independent
+	 * We have only one for now, so dump it directly
+	 */
+	scom_reg.name = "XTS.REG.ERR_HOLD";
+	scom_reg.block = NPU2_BLOCK_XTS;
+	scom_reg.offset = 0;
+	print_one_npu_reg(npu, &scom_reg, NPU2_STACK_MISC);
+
+	/* nvlink- or opencapi-specific registers */
+	for (i = 0; i < npu->total_devices; i++) {
+		dev = &npu->devices[i];
+		if (brick_index == -1 || dev->brick_index == brick_index) {
+			if (dev->type == NPU2_DEV_TYPE_NVLINK)
+				show_nvlink_regs(npu, dev->brick_index);
+			else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
+				show_opencapi_regs(npu, dev->brick_index);
+		}
+	}
+}
+
+/*
+ * Entry point for dumping all the NPU registers of a chip (e.g. from
+ * an HMI). Resolves chip_id to its npu2 structure by walking the phb
+ * list: several phbs (nvlink or opencapi) may sit on the same chip,
+ * but they all point back to the same npu2 structure, so the first
+ * match is enough.
+ */
+void npu2_dump_scoms(int chip_id)
+{
+	struct phb *phb;
+	struct npu2 *npu;
+
+	for_each_phb(phb) {
+		npu = NULL;
+		switch (phb->phb_type) {
+		case phb_type_npu_v2:
+			npu = phb_to_npu2_nvlink(phb);
+			break;
+		case phb_type_npu_v2_opencapi:
+			npu = phb_to_npu2_dev_ocapi(phb)->npu;
+			break;
+		default:
+			break;
+		}
+		if (npu && npu->chip_id == chip_id) {
+			show_all_regs(npu, -1 /* all bricks */);
+			return;
+		}
+	}
+}
+
static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn __unused)
{
struct npu2 *p = is->data;
@@ -182,6 +415,7 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
brick = 2 + ((idx - 27) % 4);
prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
p->chip_id, brick);
+ show_all_regs(p, brick);
opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
OPAL_EVENT_PCI_ERROR);
break;
OpenPOWER on IntegriCloud