summaryrefslogtreecommitdiffstats
path: root/hw/npu2-common.c
diff options
context:
space:
mode:
Diffstat (limited to 'hw/npu2-common.c')
-rw-r--r--hw/npu2-common.c234
1 files changed, 234 insertions, 0 deletions
diff --git a/hw/npu2-common.c b/hw/npu2-common.c
index ccbbbbca..d4c0f851 100644
--- a/hw/npu2-common.c
+++ b/hw/npu2-common.c
@@ -103,6 +103,239 @@ void npu2_write_mask_4b(struct npu2 *p, uint64_t reg, uint32_t val, uint32_t mas
(uint64_t)new_val << 32);
}
+typedef struct { /* one entry of a register dump table */
+ const char *name; /* label printed in the log line for this register */
+ uint32_t block; /* block argument given to NPU2_REG_OFFSET() */
+ uint32_t offset; /* offset argument given to NPU2_REG_OFFSET() */
+} npu2_scom_dump_t;
+/*
+ * Registers that show_all_regs() prints once for every selected stack:
+ * the error report message/first registers of the SM0-SM3 blocks, the
+ * error report registers of the CTL block, and the ECC/RAS registers of
+ * the DAT block. Each entry is { log label, block, offset }.
+ */
+static npu2_scom_dump_t npu2_scom_dump_global[] = {
+ /* CQ State Machine */
+ { "CS.SM0.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0 },
+ { "CS.SM1.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0 },
+ { "CS.SM2.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0 },
+ { "CS.SM3.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1 },
+ { "CS.SM1.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1 },
+ { "CS.SM2.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1 },
+ { "CS.SM3.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2 },
+ { "CS.SM1.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2 },
+ { "CS.SM2.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2 },
+ { "CS.SM3.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3 },
+ { "CS.SM1.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3 },
+ { "CS.SM2.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3 },
+ { "CS.SM3.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4 },
+ { "CS.SM1.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4 },
+ { "CS.SM2.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4 },
+ { "CS.SM3.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5 },
+ { "CS.SM1.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5 },
+ { "CS.SM2.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5 },
+ { "CS.SM3.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5 },
+
+ { "CS.SM0.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6 },
+ { "CS.SM1.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6 },
+ { "CS.SM2.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6 },
+ { "CS.SM3.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6 },
+
+ { "CS.SM0.MISC.CERR_FIRST0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0 },
+ { "CS.SM1.MISC.CERR_FIRST0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0 },
+ { "CS.SM2.MISC.CERR_FIRST0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0 },
+ { "CS.SM3.MISC.CERR_FIRST0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0 },
+
+ { "CS.SM0.MISC.CERR_FIRST1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1 },
+ { "CS.SM1.MISC.CERR_FIRST1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1 },
+ { "CS.SM2.MISC.CERR_FIRST1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1 },
+ { "CS.SM3.MISC.CERR_FIRST1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1 },
+
+ { "CS.SM0.MISC.CERR_FIRST2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2 },
+ { "CS.SM1.MISC.CERR_FIRST2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2 },
+ { "CS.SM2.MISC.CERR_FIRST2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2 },
+ { "CS.SM3.MISC.CERR_FIRST2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2 },
+
+ /* CQ Control */
+ { "CS.CTL.MISC.CERR_MESSAGE0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0 },
+ { "CS.CTL.MISC.CERR_MESSAGE1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1 },
+ { "CS.CTL.MISC.CERR_FIRST0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0 },
+ { "CS.CTL.MISC.CERR_FIRST1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1 },
+
+ /* CQ Data */
+ { "DAT.MISC.CERR_ECC_HOLD", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS },
+ { "DAT.MISC.CERR_ECC_MASK", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK },
+ { "DAT.MISC.CERR_ECC_FIRST", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST },
+ { "DAT.MISC.REM0", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0 },
+ { "DAT.MISC.REM1", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1 },
+};
+/*
+ * NTL registers dumped for nvlink devices. The table holds entries for
+ * both NTL0 and NTL1; show_nvlink_regs() prints only the entries whose
+ * block matches the NTL block it computes for the brick.
+ */
+static npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
+ { "NTL0.REGS.CERR_FIRST1", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF },
+ { "NTL1.REGS.CERR_FIRST1", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF },
+ { "NTL0.REGS.CERR_FIRST2", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF },
+ { "NTL1.REGS.CERR_FIRST2", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF },
+};
+/*
+ * OTL registers dumped for opencapi devices. The table holds entries for
+ * both OTL0 and OTL1; show_opencapi_regs() prints only the entries whose
+ * block matches the OTL block it computes for the brick.
+ */
+static npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
+ { "OTL0.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0 },
+ { "OTL1.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0 },
+ { "OTL0.MISC.OTL_REM0", NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0 },
+ { "OTL1.MISC.OTL_REM0", NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0 },
+ { "OTL0.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG },
+ { "OTL1.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG },
+ { "OTL0.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG },
+ { "OTL1.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG },
+ { "OTL0.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1 },
+ { "OTL1.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1 },
+};
+/*
+ * Read one register described by a dump table entry and log its value.
+ *
+ * npu:   NPU instance; supplies chip_id and xscom_base for the read
+ * scom:  table entry giving the log label, block and offset
+ * stack: stack value combined with block and offset by NPU2_REG_OFFSET()
+ *
+ * The register is read through npu2_scom_read() with length
+ * NPU2_MISC_DA_LEN_8B and logged at PR_ERR level together with the
+ * computed register offset.
+ */
+static void print_one_npu_reg(struct npu2 *npu, npu2_scom_dump_t *scom, int stack)
+{
+ uint64_t reg, val;
+
+ reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
+ val = npu2_scom_read(npu->chip_id, npu->xscom_base,
+ reg, NPU2_MISC_DA_LEN_8B);
+ /* The stack is logged as "stack - 4"; presumably NPU2_STACK_STCK_0 is 4 so stacks print as 0..2 - TODO confirm. NOTE(review): show_all_regs() also passes NPU2_STACK_MISC, which gets the same "STCK%d" label. */
+ prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
+ npu->chip_id, stack - 4, scom->name, reg, val);
+}
+
+/*
+ * Same as print_one_npu_reg(), but for a register that is read directly
+ * with xscom_read() at the given address instead of going through
+ * npu2_scom_read().
+ *
+ * chip_id:     chip to read from; also printed in the log prefix
+ * brick_index: only used to label the log line
+ * reg_addr:    address handed to xscom_read()
+ * reg_name:    label printed in the log line
+ *
+ * NOTE(review): any status returned by xscom_read() is not checked, so
+ * val may be logged without having been written - confirm whether
+ * xscom_read() always sets it.
+ */
+static void print_one_reg(int chip_id, int brick_index,
+ uint64_t reg_addr, const char *reg_name)
+{
+ uint64_t val;
+
+ xscom_read(chip_id, reg_addr, &val);
+ prlog(PR_ERR, "NPU[%d] %s brick %d 0x%llx = 0x%016llx\n",
+ chip_id, reg_name, brick_index, reg_addr, val);
+}
+/*
+ * Dump the nvlink-specific registers of one brick.
+ *
+ * The stack is NPU2_STACK_STCK_0 + brick_index / 2, i.e. two bricks per
+ * stack. Only the entries of npu2_scom_dump_nvlink whose block equals
+ * the NTL block computed for the brick are printed.
+ */
+static void show_nvlink_regs(struct npu2 *npu, int brick_index)
+{
+ uint32_t stack, ntl;
+ int i;
+
+ stack = NPU2_STACK_STCK_0 + brick_index / 2;
+ ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2; /* odd bricks use NTL0 + 2; presumably that is NPU2_BLOCK_NTL1 - TODO confirm */
+
+ for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_nvlink); i++) {
+ if (npu2_scom_dump_nvlink[i].block == ntl)
+ print_one_npu_reg(npu, &npu2_scom_dump_nvlink[i], stack);
+ }
+}
+/*
+ * Dump the opencapi-specific registers of one brick.
+ *
+ * First prints the entries of npu2_scom_dump_ocapi whose block equals
+ * the OTL block computed for the brick, then three ODL registers (status,
+ * training status, endpoint info) read directly through print_one_reg().
+ */
+static void show_opencapi_regs(struct npu2 *npu, int brick_index)
+{
+ uint32_t stack, otl;
+ int i;
+
+ stack = NPU2_STACK_STCK_0 + brick_index / 2;
+ otl = NPU2_BLOCK_OTL0 + (brick_index % 2); /* odd bricks use OTL0 + 1; presumably that is NPU2_BLOCK_OTL1 - TODO confirm */
+
+ /* NPU registers */
+ for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_ocapi); i++) {
+ if (npu2_scom_dump_ocapi[i].block == otl)
+ print_one_npu_reg(npu, &npu2_scom_dump_ocapi[i], stack);
+ }
+
+ /* Fabric registers: addresses come from the OB_ODL_*() macros, indexed by brick */
+ print_one_reg(npu->chip_id, brick_index,
+ OB_ODL_STATUS(brick_index), "ODL status");
+ print_one_reg(npu->chip_id, brick_index,
+ OB_ODL_TRAINING_STATUS(brick_index), "ODL training status");
+ print_one_reg(npu->chip_id, brick_index,
+ OB_ODL_ENDPOINT_INFO(brick_index), "ODL endpoint info");
+}
+/*
+ * Log the NPU error state for one brick, or for the whole NPU.
+ *
+ * npu:         NPU instance to dump
+ * brick_index: brick whose stack and device registers are dumped, or -1
+ *              to dump every stack and every device
+ *
+ * Output order: the FIR registers with their masks, the per-stack
+ * registers listed in npu2_scom_dump_global, the register labelled
+ * "XTS.REG.ERR_HOLD", then the nvlink or opencapi registers of each
+ * matching device.
+ */
+static void show_all_regs(struct npu2 *npu, int brick_index)
+{
+ int i, stack, stack_min, stack_max;
+ uint64_t fir_val, mask_val, fir_addr, mask_addr;
+ struct npu2_dev *dev;
+ npu2_scom_dump_t scom_reg;
+
+ if (brick_index != -1) { /* single brick: dump only the stack it maps to */
+ stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
+ } else {
+ stack_min = NPU2_STACK_STCK_0;
+ stack_max = NPU2_STACK_STCK_2;
+ /* Avoid dumping unused stacks for opencapi on Lagrange */
+ if (npu->total_devices == 2)
+ stack_min = stack_max = NPU2_STACK_STCK_1;
+ }
+
+ /* NPU FIRs: the last value logged is the FIR with the bits set in the mask cleared */
+ for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+ fir_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
+ mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET;
+ xscom_read(npu->chip_id, fir_addr, &fir_val); /* NOTE(review): status of xscom_read() is not checked; fir_val may be logged unset - confirm */
+ xscom_read(npu->chip_id, mask_addr, &mask_val); /* NOTE(review): same for mask_val */
+ prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx => 0x%016llx)\n",
+ npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
+ }
+
+ /* NPU global, per-stack registers */
+ for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_global); i++) {
+ for (stack = stack_min; stack <= stack_max; stack++)
+ print_one_npu_reg(npu, &npu2_scom_dump_global[i], stack);
+ }
+
+ /*
+ * NPU global registers, stack independent
+ * We have only one for now, so dump it directly using a temporary
+ * table entry built on the stack.
+ * NOTE(review): print_one_npu_reg() labels the line "STCK%d" using
+ * stack - 4, which is also applied to NPU2_STACK_MISC here.
+ */
+ scom_reg.name = "XTS.REG.ERR_HOLD";
+ scom_reg.block = NPU2_BLOCK_XTS;
+ scom_reg.offset = 0;
+ print_one_npu_reg(npu, &scom_reg, NPU2_STACK_MISC);
+
+ /* nvlink- or opencapi-specific registers; devices of any other type are skipped */
+ for (i = 0; i < npu->total_devices; i++) {
+ dev = &npu->devices[i];
+ if (brick_index == -1 || dev->brick_index == brick_index) {
+ if (dev->type == NPU2_DEV_TYPE_NVLINK)
+ show_nvlink_regs(npu, dev->brick_index);
+ else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
+ show_opencapi_regs(npu, dev->brick_index);
+ }
+ }
+}
+/*
+ * Dump the error registers of the NPU located on the given chip.
+ *
+ * chip_id: compared against npu->chip_id of every nvlink or opencapi phb
+ *
+ * The first matching NPU is dumped for all bricks and the search stops.
+ * If no phb yields an NPU with that chip ID, nothing is logged.
+ */
+void npu2_dump_scoms(int chip_id)
+{
+ struct npu2 *npu;
+ struct phb *phb;
+ struct npu2_dev *dev;
+
+ /*
+ * Look for the npu2 structure for that chip ID. We can access it
+ * through the array of phbs, looking for an nvlink or opencapi
+ * phb. We can have several entries, but they all point
+ * to the same npu2 structure
+ */
+ for_each_phb(phb) {
+ npu = NULL; /* reset so phbs of other types are ignored below */
+ if (phb->phb_type == phb_type_npu_v2) {
+ npu = phb_to_npu2_nvlink(phb);
+ } else if (phb->phb_type == phb_type_npu_v2_opencapi) {
+ dev = phb_to_npu2_dev_ocapi(phb); /* NOTE(review): dev is dereferenced unchecked - confirm it cannot be NULL here */
+ npu = dev->npu;
+ }
+ if (npu && npu->chip_id == chip_id) {
+ show_all_regs(npu, -1 /* all bricks */);
+ break;
+ }
+ }
+}
+
static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn __unused)
{
struct npu2 *p = is->data;
@@ -182,6 +415,7 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
brick = 2 + ((idx - 27) % 4);
prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
p->chip_id, brick);
+ show_all_regs(p, brick);
opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
OPAL_EVENT_PCI_ERROR);
break;
OpenPOWER on IntegriCloud