 core/init.c            |    3 +
 hw/Makefile.inc        |    2 +-
 hw/npu-hw-procedures.c |  598 ++++++++++++
 hw/npu.c               | 1718 ++++++++++++++++++++++++++++++++
 include/npu-regs.h     |  235 +++++
 include/npu.h          |  211 ++++
 include/skiboot.h      |    2 +-
 7 files changed, 2767 insertions(+), 2 deletions(-)
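The bulk of this commit is the link-training state machine in hw/npu-hw-procedures.c: each hardware procedure is a table of step functions, and each step returns PROCEDURE_NEXT to fall through to the following step immediately, PROCEDURE_INPROGRESS to be polled again on a later config-space read, or PROCEDURE_COMPLETE (optionally with error bits). Below is a minimal, self-contained sketch of that dispatch pattern, not part of the commit itself: struct fake_dev, run() and the start/wait_hw/finish steps are illustrative stand-ins, while the real code drives struct npu_dev via get_procedure_status() and additionally applies a 100ms timebase timeout and a status mask.

/* Standalone illustration only -- compresses the DEFINE_PROCEDURE /
 * step-dispatch pattern from hw/npu-hw-procedures.c into a few lines.
 * fake_dev stands in for struct npu_dev. */
#include <stdint.h>
#include <stdio.h>

#define PROCEDURE_INPROGRESS	(1u << 31)	/* poll again later */
#define PROCEDURE_COMPLETE	(1u << 30)	/* done (may carry error bits) */
#define PROCEDURE_NEXT		(1u << 29)	/* fall through to next step */

struct fake_dev { uint16_t step; };		/* step index persists across polls */
typedef uint32_t (*step)(struct fake_dev *);

struct procedure {
	const char *name;
	step steps[];				/* flexible array of step functions */
};

/* Same shape as the macro in the diff (GNU named-variadic extension) */
#define DEFINE_PROCEDURE(NAME, STEPS...) \
	struct procedure procedure_##NAME = \
	{ .name = #NAME, .steps = { NAME, ##STEPS } }

static uint32_t start(struct fake_dev *d)   { (void)d; return PROCEDURE_NEXT; }
static uint32_t wait_hw(struct fake_dev *d) { (void)d; return PROCEDURE_NEXT; }
static uint32_t finish(struct fake_dev *d)  { (void)d; return PROCEDURE_COMPLETE; }
DEFINE_PROCEDURE(start, wait_hw, finish);

/* Core of get_procedure_status(): run steps until one stops returning
 * PROCEDURE_NEXT; the saved step index lets a later poll resume there. */
static uint32_t run(struct procedure *p, struct fake_dev *d)
{
	uint32_t result;

	do {
		result = p->steps[d->step](d);
		if (result & PROCEDURE_NEXT)
			d->step++;
	} while (result & PROCEDURE_NEXT);

	return result;
}

int main(void)
{
	struct fake_dev d = { .step = 0 };

	if (run(&procedure_start, &d) & PROCEDURE_COMPLETE)
		printf("procedure %s complete at step %u\n",
		       procedure_start.name, d.step);
	return 0;
}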
diff --git a/core/init.c b/core/init.c index 7ae4deef..6d21b55b 100644 --- a/core/init.c +++ b/core/init.c @@ -740,6 +740,9 @@ void __noreturn main_cpu_entry(const void *fdt, u32 master_cpu) /* Probe PHB3 on P8 */ probe_phb3(); + /* Probe NPUs */ + probe_npu(); + /* Initialize PCI */ pci_init_slots(); diff --git a/hw/Makefile.inc b/hw/Makefile.inc index 034947c0..6eacb749 100644 --- a/hw/Makefile.inc +++ b/hw/Makefile.inc @@ -6,7 +6,7 @@ HW_OBJS += homer.o slw.o occ.o fsi-master.o centaur.o HW_OBJS += nx.o nx-rng.o nx-crypto.o nx-842.o HW_OBJS += p7ioc.o p7ioc-inits.o p7ioc-phb.o p5ioc2.o p5ioc2-phb.o HW_OBJS += phb3.o sfc-ctrl.o fake-rtc.o bt.o p8-i2c.o prd.o -HW_OBJS += dts.o lpc-rtc.o +HW_OBJS += dts.o lpc-rtc.o npu.o npu-hw-procedures.o HW=hw/built-in.o include $(SRC)/hw/fsp/Makefile.inc diff --git a/hw/npu-hw-procedures.c b/hw/npu-hw-procedures.c new file mode 100644 index 00000000..118ed6d7 --- /dev/null +++ b/hw/npu-hw-procedures.c @@ -0,0 +1,598 @@ +/* Copyright 2013-2015 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <skiboot.h> +#include <io.h> +#include <timebase.h> +#include <pci.h> +#include <interrupts.h> +#include <lock.h> +#include <npu-regs.h> +#include <npu.h> +#include <xscom.h> + +typedef uint32_t (*step)(struct npu_dev *); + +struct procedure { + const char *name; + step steps[]; +}; + +#define DEFINE_PROCEDURE(NAME, STEPS...) \ + struct procedure procedure_##NAME = \ + {.name = #NAME, .steps = {NAME, ##STEPS}} + +#define PROCEDURE_INPROGRESS (1 << 31) +#define PROCEDURE_COMPLETE (1 << 30) +#define PROCEDURE_NEXT (1 << 29) +#define PROCEDURE_FAILED 2 +#define PROCEDURE_ABORTED 3 +#define PROCEDURE_UNSUPPORTED 4 + +/* Mask defining which status bits we want to expose */ +#define PROCEDURE_STATUS_MASK 0xc000000f + +/* Accesors for PHY registers. These can be done either via MMIO or SCOM. 
*/ +static bool pl_use_scom = 1; +static void phy_write(struct npu_dev *npu_dev, uint64_t addr, uint32_t val) +{ + if (pl_use_scom) + xscom_write(npu_dev->npu->chip_id, npu_dev->pl_xscom_base | addr, val); + else + out_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr), val); +} + +static uint16_t phy_read(struct npu_dev *npu_dev, uint64_t addr) +{ + uint64_t val; + + if (pl_use_scom) + xscom_read(npu_dev->npu->chip_id, npu_dev->pl_xscom_base + addr, &val); + else + val = in_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr)); + + return val & 0xffff; +} + +/* The DL registers can be accessed indirectly via the NTL */ +static void dl_write(struct npu_dev *npu_dev, uint32_t addr, uint32_t val) +{ + xscom_write(npu_dev->npu->chip_id, + npu_dev->xscom + NX_DL_REG_ADDR, addr); + xscom_write(npu_dev->npu->chip_id, + npu_dev->xscom + NX_DL_REG_DATA, val); +} + +static uint64_t __unused dl_read(struct npu_dev *npu_dev, uint32_t addr) +{ + uint64_t val; + + xscom_write(npu_dev->npu->chip_id, + npu_dev->xscom + NX_DL_REG_ADDR, addr); + xscom_read(npu_dev->npu->chip_id, + npu_dev->xscom + NX_DL_REG_DATA, &val); + return val; +} + +/* Our hardware bits are backwards here. The lane vectors are 16-bit + * values represented in IBM bit ordering. This means lane 0 is + * represented by bit 15 in most of the registers. Internally we keep + * this sane (ie. npu_dev->lane_mask[0] == lane 0) as we need sane + * numbering for set_lane_reg() anyway. */ +static uint32_t phy_lane_mask(struct npu_dev *npu_dev) +{ + /* We only train 8 lanes at a time so we don't do a full + * bit-swap */ + assert(npu_dev->lane_mask == 0xff00 || npu_dev->lane_mask == 0xff); + + return ~npu_dev->lane_mask & 0xffff; +} + +static void set_lane_reg(struct npu_dev *npu_dev, uint64_t base_reg, + uint64_t data, uint64_t mask) +{ + uint64_t val, i; + uint32_t lane_mask = npu_dev->lane_mask; + + for (i = 0; i <= 23; i++) { + if (lane_mask & (1ul << i)) { + uint64_t tx_rxcal_reg = base_reg + (i << 32); + val = phy_read(npu_dev, tx_rxcal_reg); + val = (val & ~mask) | data; + phy_write(npu_dev, tx_rxcal_reg, val); + } + } +} + +static uint32_t stop(struct npu_dev *npu_dev __unused) +{ + return PROCEDURE_COMPLETE | PROCEDURE_ABORTED; +} +DEFINE_PROCEDURE(stop); + +static uint32_t nop(struct npu_dev *npu_dev __unused) +{ + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(nop); + +/* Procedure 1.2.1 (RESET_NPU_DL) from opt_programmerguide.odt. Also + * incorporates AT reset. 
*/ +static uint32_t reset_npu_dl(struct npu_dev *npu_dev) +{ + void *ntl_base = (void *) npu_dev->bar.base; + uint64_t val; + + /* Assert NPU reset */ + val = in_be64(ntl_base + NTL_CONTROL); + val |= NTL_CONTROL_RESET; + out_be64(ntl_base + NTL_CONTROL, val); + + /* Put the Nvidia logic in reset */ + dl_write(npu_dev, NDL_CONTROL, 0xe8000000); + + /* Release Nvidia logic from reset */ + dl_write(npu_dev, NDL_CONTROL, 0); + + /* Release NPU from reset */ + val &= ~NTL_CONTROL_RESET; + out_be64(ntl_base + NTL_CONTROL, val); + + /* Setup up TL credits */ + out_be64(ntl_base + TL_CMD_CR, PPC_BIT(0)); + out_be64(ntl_base + TL_CMD_D_CR, PPC_BIT(0)); + out_be64(ntl_base + TL_RSP_CR, PPC_BIT(15)); + out_be64(ntl_base + TL_RSP_D_CR, PPC_BIT(15)); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(reset_npu_dl); + +/* Procedures 1.2.3 (reset_lanes) & 1.2.4 + * (io_register_write_reset_values) */ +static uint32_t phy_reset(struct npu_dev *npu_dev) +{ + uint16_t val; + + /* Lower run_lane inputs for lanes to be reset */ + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + val &= ~phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_reset_wait(struct npu_dev *npu_dev) +{ + uint16_t val; + + /* Wait for lane busy outputs to go to zero for lanes to be + * reset */ + val = phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15); + if (val & phy_lane_mask(npu_dev)) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_NEXT; +} + +static uint32_t phy_reset_complete(struct npu_dev *npu_dev) +{ + uint16_t val; + uint32_t lane_mask = phy_lane_mask(npu_dev); + + /* Set ioreset_vec for the desired lanes bit positions */ + val = phy_read(npu_dev, RX_IORESET_VEC_0_15); + phy_write(npu_dev, RX_IORESET_VEC_0_15, val | lane_mask); + + val = phy_read(npu_dev, TX_IORESET_VEC_0_15); + phy_write(npu_dev, TX_IORESET_VEC_0_15, val | lane_mask); + + /* Clear ioreset_vec */ + val = phy_read(npu_dev, RX_IORESET_VEC_0_15); + phy_write(npu_dev, RX_IORESET_VEC_0_15, val & ~lane_mask); + + val = phy_read(npu_dev, TX_IORESET_VEC_0_15); + phy_write(npu_dev, TX_IORESET_VEC_0_15, val & ~lane_mask); + + /* Reset RX phase rotators */ + set_lane_reg(npu_dev, RX_PR_CNTL_PL, RX_PR_RESET, RX_PR_RESET); + set_lane_reg(npu_dev, RX_PR_CNTL_PL, 0, RX_PR_RESET); + + /* Restore registers from scominit that may have changed */ + set_lane_reg(npu_dev, RX_PR_MODE, 0x8, RX_PR_PHASE_STEP); + set_lane_reg(npu_dev, RX_A_DAC_CNTL, + 0x7 << MASK_TO_LSH(RX_PR_IQ_RES_SEL), + RX_PR_IQ_RES_SEL); + set_lane_reg(npu_dev, TX_MODE1_PL, 0, TX_LANE_PDWN); + set_lane_reg(npu_dev, RX_BANK_CONTROLS, 0, RX_LANE_ANA_PDWN); + set_lane_reg(npu_dev, RX_MODE, 0, RX_LANE_DIG_PDWN); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete); + +/* Round a fixed decimal number. 
Frac is the number of fractional + * bits */ +static uint32_t round(uint32_t val, int frac) +{ + if (val >> (frac - 1) & 0x1) + return (val >> frac) + 1; + else + return val >> frac; +} + +#define ZCAL_MIN (10 << 3) +#define ZCAL_MAX (40 << 3) +#define ZCAL_K0 0x0 +#define ZCAL_M 128 +/* TODO: add a test case for the following values: + + Initial values: + zcal_n = 0xda; + zcal_p = 0xc7; + + Results: + pre_p = 0x0 + pre_n = 0x0 + margin_p = 0x0 + margin_n = 0x0 + total_en_p = 0x32 + total_en_n = 0x37 + */ + +static uint32_t phy_tx_zcal(struct npu_dev *npu_dev) +{ + uint64_t val; + + if (npu_dev->index < 2 && npu_dev->npu->tx_zcal_complete[0]) + return PROCEDURE_COMPLETE; + + if (npu_dev->index >= 2 && npu_dev->npu->tx_zcal_complete[1]) + return PROCEDURE_COMPLETE; + + /* Start calibration */ + val = phy_read(npu_dev, TX_IMPCAL_SWO1_PB); + val &= TX_ZCAL_SWO_EN; + phy_write(npu_dev, TX_IMPCAL_SWO1_PB, val); + phy_write(npu_dev, TX_IMPCAL_SWO2_PB, 0x50 << 2); + val = phy_read(npu_dev, TX_IMPCAL_PB); + val |= TX_ZCAL_REQ; + phy_write(npu_dev, TX_IMPCAL_PB, val); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_tx_zcal_wait(struct npu_dev *npu_dev) +{ + uint64_t val; + + val = phy_read(npu_dev, TX_IMPCAL_PB); + if (!(val & TX_ZCAL_DONE)) + return PROCEDURE_INPROGRESS; + + if (val & TX_ZCAL_ERROR) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + return PROCEDURE_NEXT; +} + +static uint32_t phy_tx_zcal_calculate(struct npu_dev *npu_dev) +{ + uint64_t val; + uint64_t zcal_n; + uint64_t zcal_p; + uint64_t margin_n; + uint64_t margin_p; + uint64_t pre_n; + uint64_t pre_p; + uint64_t total_en_n; + uint64_t total_en_p; + + val = phy_read(npu_dev, TX_IMPCAL_NVAL_PB); + zcal_n = GETFIELD(TX_ZCAL_N, val); + val = phy_read(npu_dev, TX_IMPCAL_PVAL_PB); + zcal_p = GETFIELD(TX_ZCAL_P, val); + + if ((zcal_n < ZCAL_MIN) || (zcal_n > ZCAL_MAX) || + (zcal_p < ZCAL_MIN) || (zcal_p > ZCAL_MAX)) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + margin_n = (0x80 - ZCAL_M) * zcal_n / 2; + margin_p = (0x80 - ZCAL_M) * zcal_p / 2; + pre_n = (((0x80 * zcal_n) - (2 * margin_n)) * ZCAL_K0) / 0x80; + pre_p = (((0x80 * zcal_p) - (2 * margin_p)) * ZCAL_K0) / 0x80; + + total_en_n = 0x80 * zcal_n - (2 * margin_n) - (pre_n & 1023); + total_en_p = 0x80 * zcal_p - (2 * margin_p) - (pre_p & 1023); + + pre_p = round(pre_p, 9); + pre_n = round(pre_n, 9); + margin_p = round(margin_p, 9); + margin_n = round(margin_n, 9); + total_en_p = round(total_en_p, 9); + total_en_n = round(total_en_n, 9); + + val = SETFIELD(TX_FFE_TOTAL_ENABLE_N_ENC, 0, total_en_n); + val = SETFIELD(TX_FFE_TOTAL_ENABLE_P_ENC, val, total_en_p); + phy_write(npu_dev, TX_FFE_TOTAL_2RSTEP_EN, val); + + val = SETFIELD(TX_FFE_PRE_N_SEL_ENC, 0, pre_n); + val = SETFIELD(TX_FFE_PRE_P_SEL_ENC, val, pre_p); + phy_write(npu_dev, TX_FFE_PRE_2RSTEP_SEL, val); + + val = SETFIELD(TX_FFE_MARGIN_PD_N_SEL_ENC, 0, margin_n); + val = SETFIELD(TX_FFE_MARGIN_PU_P_SEL_ENC, val, margin_p); + phy_write(npu_dev, TX_FFE_MARGIN_2RSTEP_SEL, val); + + if (npu_dev->index < 2) + npu_dev->npu->tx_zcal_complete[0] = true; + else + npu_dev->npu->tx_zcal_complete[1] = true; + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate); + +static uint32_t phy_enable_tx_rxcal(struct npu_dev *npu_dev) +{ + /* Turn common mode on */ + set_lane_reg(npu_dev, TX_MODE2_PL, TX_RXCAL, TX_RXCAL); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_enable_tx_rxcal); + +static uint32_t phy_disable_tx_rxcal(struct npu_dev *npu_dev) +{ + /* Turn common mode 
off */ + set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_RXCAL); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_disable_tx_rxcal); + +static uint32_t phy_rx_dccal(struct npu_dev *npu_dev) +{ + if (phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15) + & ~phy_read(npu_dev, RX_INIT_DONE_VEC_0_15)) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_NEXT; +} + +static uint32_t phy_rx_dccal_start(struct npu_dev *npu_dev) +{ + uint64_t val; + + /* Save EO step control */ + val = phy_read(npu_dev, RX_EO_STEP_CNTL_PG); + npu_dev->procedure_data = val; + + phy_write(npu_dev, RX_EO_STEP_CNTL_PG, + RX_EO_ENABLE_LATCH_OFFSET_CAL + | RX_EO_ENABLE_CM_COARSE_CAL); + + val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15); + val |= phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val); + + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + val |= phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val); + + return PROCEDURE_NEXT; +} + +static uint32_t phy_rx_dccal_complete(struct npu_dev *npu_dev) +{ + /* Poll for completion on relevant lanes */ + if ((phy_read(npu_dev, RX_INIT_DONE_VEC_0_15) & phy_lane_mask(npu_dev)) + != phy_lane_mask(npu_dev)) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_NEXT; +} + +static uint32_t phy_rx_dccal_fifo_init(struct npu_dev *npu_dev) +{ + uint64_t val; + + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + val &= ~phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val); + + /* Turn off recal abort */ + val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15); + val &= ~phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val); + + /* Restore original settings */ + phy_write(npu_dev, RX_EO_STEP_CNTL_PG, npu_dev->procedure_data); + + /* FIFO Init */ + set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_UNLOAD_CLK_DISABLE); + set_lane_reg(npu_dev, TX_CNTL_STAT2, TX_FIFO_INIT, TX_FIFO_INIT); + set_lane_reg(npu_dev, TX_MODE2_PL, TX_UNLOAD_CLK_DISABLE, + TX_UNLOAD_CLK_DISABLE); + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_start, phy_rx_dccal_complete, + phy_rx_dccal_fifo_init); + +static uint32_t phy_rx_training(struct npu_dev *npu_dev) +{ + uint16_t val; + + if (!npu_dev->procedure_data) { + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + val |= phy_lane_mask(npu_dev); + phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val); + } + + npu_dev->procedure_data++; + if (npu_dev->procedure_data >= 1000000) + return PROCEDURE_COMPLETE | PROCEDURE_FAILED; + + val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15); + if ((val & phy_lane_mask(npu_dev)) != phy_lane_mask(npu_dev)) + return PROCEDURE_INPROGRESS; + + return PROCEDURE_COMPLETE; +} +DEFINE_PROCEDURE(phy_rx_training); + +static struct procedure *npu_procedures[] = { + &procedure_stop, + &procedure_nop, + NULL, + NULL, + &procedure_phy_reset, + &procedure_phy_tx_zcal, + &procedure_phy_rx_dccal, + &procedure_phy_enable_tx_rxcal, + &procedure_phy_disable_tx_rxcal, + &procedure_phy_rx_training, + &procedure_reset_npu_dl, + + /* Place holders for pre-terminate and terminate procedures */ + &procedure_nop, + &procedure_nop}; + +/* Run a procedure step(s) and return status */ +static uint32_t get_procedure_status(struct npu_dev *dev) +{ + uint32_t result; + uint16_t procedure = dev->procedure_number; + uint16_t step = dev->procedure_step; + const char *name = npu_procedures[procedure]->name; + + do { + result = npu_procedures[procedure]->steps[step](dev); + + if (result & PROCEDURE_NEXT) { + step++; + NPUDEVINF(dev, "Running procedure %s step %d\n", name, step); + } + } while 
(result & PROCEDURE_NEXT); + + dev->procedure_step = step; + + if (result & PROCEDURE_COMPLETE) + NPUDEVINF(dev, "Procedure %s complete\n", name); + else if (mftb() > dev->procedure_tb + msecs_to_tb(100)) { + NPUDEVINF(dev, "Procedure %s timed out\n", name); + result = PROCEDURE_COMPLETE | PROCEDURE_FAILED; + } + + /* Mask off internal state bits */ + dev->procedure_status = result & PROCEDURE_STATUS_MASK; + + return dev->procedure_status; +} + +int64_t npu_dev_procedure_read(struct npu_dev_trap *trap, + uint32_t offset, + uint32_t size, + uint32_t *data) +{ + struct npu_dev *dev = trap->dev; + int64_t rc = OPAL_SUCCESS; + + if (size != 4) { + /* Short config reads are not supported */ + NPUDEVERR(dev, "Short read of procedure register\n"); + return OPAL_PARAMETER; + } + + offset -= trap->start; + *data = 0; + + switch (offset) { + case 0: + /* Only run the procedure if not already complete */ + if (dev->procedure_status & PROCEDURE_COMPLETE) + *data = dev->procedure_status; + else + *data = get_procedure_status(dev); + + break; + + case 4: + *data = dev->procedure_number; + break; + + default: + NPUDEVERR(dev, "Invalid vendor specific offset 0x%08x\n", + offset); + rc = OPAL_PARAMETER; + } + + return rc; +} + +int64_t npu_dev_procedure_write(struct npu_dev_trap *trap, + uint32_t offset, + uint32_t size, + uint32_t data) +{ + struct npu_dev *dev = trap->dev; + const char *name; + int64_t rc = OPAL_SUCCESS; + + if (size != 4) { + /* Short config writes are not supported */ + NPUDEVERR(dev, "Short read of procedure register\n"); + return OPAL_PARAMETER; + } + + offset -= trap->start; + + switch (offset) { + case 0: + /* We ignore writes to the status register */ + NPUDEVINF(dev, "Ignoring writes to status register\n"); + break; + + case 4: + if (data >= ARRAY_SIZE(npu_procedures) || + !npu_procedures[data]) { + NPUDEVINF(dev, "Unsupported procedure number %d\n", data); + dev->procedure_status = PROCEDURE_COMPLETE + | PROCEDURE_UNSUPPORTED; + break; + } + + name = npu_procedures[data]->name; + if (dev->procedure_number == data + && !(dev->procedure_status & PROCEDURE_COMPLETE)) + NPUDEVINF(dev, "Restarting procuedure %s\n", name); + else + NPUDEVINF(dev, "Starting procedure %s\n", name); + + dev->procedure_status = PROCEDURE_INPROGRESS; + dev->procedure_number = data; + dev->procedure_step = 0; + dev->procedure_data = 0; + dev->procedure_tb = mftb(); + break; + + default: + NPUDEVINF(dev, "Invalid vendor specific offset 0x%08x\n", offset); + rc = OPAL_PARAMETER; + } + + return rc; +} diff --git a/hw/npu.c b/hw/npu.c new file mode 100644 index 00000000..c9bc12ba --- /dev/null +++ b/hw/npu.c @@ -0,0 +1,1718 @@ +/* Copyright 2013-2015 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <skiboot.h> +#include <io.h> +#include <timebase.h> +#include <pci.h> +#include <pci-cfg.h> +#include <interrupts.h> +#include <opal.h> +#include <opal-api.h> +#include <cpu.h> +#include <device.h> +#include <ccan/str/str.h> +#include <ccan/array_size/array_size.h> +#include <affinity.h> +#include <npu-regs.h> +#include <npu.h> +#include <lock.h> +#include <xscom.h> + +/* + * Terminology: + * + * Brick - A group of either 8 TX or 8 RX lanes + * Link - A group of 8 TX and 8 RX lanes + * + * Each link is represented in system software as an emulated PCI + * device. Garrison has two chips each with 4 links, therefore there + * are 8 emulated PCI devices in total. + * + * +----------------------------------------------------------------+ + * | PBCQ3 (SCOM Base Address 0x2012c00) | + * | PHB3 (SCOM Base Address 0x9012c00) | + * +----------------------------------------------------------------+ + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * +----------------------------------------------------------------+ + * | PCIe x8 | + * +----------------------------------------------------------------+ + * | GPU0 | + * +--------------------------------+-------------------------------+ + * | NV Link 1 | NV Link 0 | + * +---------------+----------------+---------------+---------------+ + * | RX | TX | RX | TX | + * +---------------+----------------+---------------+---------------+ + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * +---------------+----------------+---------------+---------------+ + * | TX | RX | TX | RX | + * +---------------+----------------+---------------+---------------+ + * | Lanes [0:7] PHY 0 Lanes [8:15] | + * | SCOM Base Address 0x8000080008010c3f | + * +--------------------------------+-------------------------------+ + * | Link 0 NDL/NTL | Link 1 NTL/NDL | + * | SCOM Base Address 0x8013c00 | SCOM Base Address 0x8013c40 | + * +--------------------------------+-------------------------------+ + * | | + * | Address Translation/AT (shared for all links) | + * | SCOM Base Address 0x8013d80 | + * | | + * +--------------------------------+-------------------------------+ + * | Link 3 NDL/NTL | Link 4 NTL/NDL | + * | SCOM Base Address 0x8013d00 | SCOM Base Address 0x8013d40 | + * +--------------------------------+-------------------------------+ + * | Lanes [8:15] PHY 1 Lanes [0:7] | + * | SCOM Base Address 0x8000080008010c7f | + * +---------------+----------------+---------------+---------------+ + * | TX | RX | TX | RX | + * +---------------+----------------+---------------+---------------+ + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * |||||||| |||||||| |||||||| |||||||| + * +---------------+----------------+---------------+---------------+ + * | RX | TX | RX | TX | + * +---------------+----------------+---------------+---------------+ + * | NV Link 2 | NV Link 3 | + * +--------------------------------+-------------------------------+ + * | GPU1 | + * +----------------------------------------------------------------+ + * | PCIe x8 | + * +----------------------------------------------------------------+ + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * |||||||| |||||||| + * +----------------------------------------------------------------+ + * | PHB2 (SCOM Base Address 0x9012800) | + * | PBCQ2 (SCOM Base Address 0x2012800) 
| + * +----------------------------------------------------------------+ + * + */ + +static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev, + uint16_t id); + +/* PCI config raw accessors */ +#define NPU_DEV_CFG_NORMAL_RD(d, o, s, v) \ + npu_dev_cfg_read_raw(d, NPU_DEV_CFG_NORMAL, o, s, v) +#define NPU_DEV_CFG_NORMAL_WR(d, o, s, v) \ + npu_dev_cfg_write_raw(d, NPU_DEV_CFG_NORMAL, o, s, v) +#define NPU_DEV_CFG_RDONLY_RD(d, o, s, v) \ + npu_dev_cfg_read_raw(d, NPU_DEV_CFG_RDONLY, o, s, v) +#define NPU_DEV_CFG_RDONLY_WR(d, o, s, v) \ + npu_dev_cfg_write_raw(d, NPU_DEV_CFG_RDONLY, o, s, v) +#define NPU_DEV_CFG_W1CLR_RD(d, o, s, v) \ + npu_dev_cfg_read_raw(d, NPU_DEV_CFG_W1CLR, o, s, v) +#define NPU_DEV_CFG_W1CLR_WR(d, o, s, v) \ + npu_dev_cfg_write_raw(d, NPU_DEV_CFG_W1CLR, o, s, v) + +#define NPU_DEV_CFG_INIT(d, o, s, v, ro, w1) \ + do { \ + NPU_DEV_CFG_NORMAL_WR(d, o, s, v); \ + NPU_DEV_CFG_RDONLY_WR(d, o, s, ro); \ + NPU_DEV_CFG_W1CLR_WR(d, o, s, w1); \ + } while(0) + +#define NPU_DEV_CFG_INIT_RO(d, o, s, v) \ + NPU_DEV_CFG_INIT(d, o, s, v, 0xffffffff, 0) + +static void npu_dev_cfg_read_raw(struct npu_dev *dev, + uint32_t index, + uint32_t offset, + uint32_t size, + uint32_t *val) +{ + uint8_t *pcfg = dev->config[index]; + uint32_t r, t, i; + + r = 0; + for (i = 0; i < size; i++) { + t = pcfg[offset + i]; + r |= (t << (i * 8)); + } + + *val = r; +} + +static void npu_dev_cfg_write_raw(struct npu_dev *dev, + uint32_t index, + uint32_t offset, + uint32_t size, + uint32_t val) +{ + uint8_t *pcfg = dev->config[index]; + uint32_t i; + + for (i = offset; i < (offset + size); i++) { + pcfg[i] = val; + val = (val >> 8); + } +} + +/* Returns the scom base for the given link index */ +static uint64_t npu_link_scom_base(struct dt_node *dn, uint32_t scom_base, + int index) +{ + struct dt_node *link; + uint32_t link_index; + char namebuf[32]; + + snprintf(namebuf, sizeof(namebuf), "link@%x", index); + link = dt_find_by_name(dn, namebuf); + assert(link); + link_index = dt_prop_get_u32(link, "ibm,npu-link-index"); + return scom_base + (link_index * NPU_LINK_SIZE); +} + +static uint64_t get_bar_size(uint64_t bar) +{ + return (1 << GETFIELD(NX_MMIO_BAR_SIZE, bar)) * 0x10000; +} + +static void npu_lock(struct phb *phb) +{ + struct npu *p = phb_to_npu(phb); + + lock(&p->lock); +} + +static void npu_unlock(struct phb *phb) +{ + struct npu *p = phb_to_npu(phb); + + unlock(&p->lock); +} + +/* Update the changes of the device BAR to link BARs */ +static void npu_dev_bar_update(uint32_t gcid, struct npu_dev_bar *bar, + bool enable) +{ + uint64_t val; + + if (!bar->xscom) + return; + + val = bar->base; + val = SETFIELD(NX_MMIO_BAR_SIZE, val, ilog2(bar->size / 0x10000)); + if (enable) + val |= NX_MMIO_BAR_ENABLE; + xscom_write(gcid, bar->xscom, val); +} + +/* Trap for PCI command (0x4) to enable or disable device's BARs */ +static int64_t npu_dev_cfg_write_cmd(struct npu_dev_trap *trap, + uint32_t offset, + uint32_t size, + uint32_t data) +{ + struct npu_dev *dev = trap->dev; + bool enable; + + if (offset != PCI_CFG_CMD) + return OPAL_PARAMETER; + if (size != 1 && size != 2 && size != 4) + return OPAL_PARAMETER; + + /* Update device BARs and link BARs will be syncrhonized + * with hardware automatically. + */ + enable = !!(data & PCI_CFG_CMD_MEM_EN); + npu_dev_bar_update(dev->npu->chip_id, &dev->bar, enable); + + /* Normal path to update PCI config buffer */ + return OPAL_PARAMETER; +} + +/* + * Trap for memory BARs: 0xFF's should be written to BAR register + * prior to getting its size. 
+ */ +static int64_t npu_dev_cfg_read_bar(struct npu_dev_trap *trap, + uint32_t offset, + uint32_t size, + uint32_t *data) +{ + struct npu_dev_bar *bar = trap->data; + + /* Revert to normal path if we weren't trapped for BAR size */ + if (!bar->trapped) + return OPAL_PARAMETER; + + if (offset != trap->start && + offset != trap->start + 4) + return OPAL_PARAMETER; + if (size != 4) + return OPAL_PARAMETER; + + bar->trapped = false; + *data = bar->bar_sz; + return OPAL_SUCCESS; +} + +static int64_t npu_dev_cfg_write_bar(struct npu_dev_trap *trap, + uint32_t offset, + uint32_t size, + uint32_t data) +{ + struct npu_dev_bar *bar = trap->data; + struct npu_dev *dev = container_of(bar, struct npu_dev, bar); + uint32_t pci_cmd; + + if (offset != trap->start && + offset != trap->start + 4) + return OPAL_PARAMETER; + if (size != 4) + return OPAL_PARAMETER; + + /* Return BAR size on next read */ + if (data == 0xffffffff) { + bar->trapped = true; + if (offset == trap->start) + bar->bar_sz = (bar->size & 0xffffffff); + else + bar->bar_sz = (bar->size >> 32); + + return OPAL_SUCCESS; + } + + /* Update BAR base address */ + if (offset == trap->start) { + bar->base &= 0xffffffff00000000; + bar->base |= (data & 0xfffffff0); + } else { + bar->base &= 0x00000000ffffffff; + bar->base |= ((uint64_t)data << 32); + + NPU_DEV_CFG_NORMAL_RD(dev, PCI_CFG_CMD, 4, &pci_cmd); + npu_dev_bar_update(dev->npu->chip_id, bar, + !!(pci_cmd & PCI_CFG_CMD_MEM_EN)); + } + + /* We still depend on the normal path to update the + * cached config buffer. + */ + return OPAL_PARAMETER; +} + +static struct npu_dev *bdfn_to_npu_dev(struct npu *p, uint32_t bdfn) +{ + int i; + + /* Sanity check */ + if (bdfn & ~0xff) + return NULL; + + for(i = 0; i < p->total_devices; i++) { + if (p->devices[i].bdfn == bdfn) + return &p->devices[i]; + } + + return NULL; + +} + +static struct npu_dev *npu_dev_cfg_check(struct npu *p, + uint32_t bdfn, + uint32_t offset, + uint32_t size) +{ + /* Sanity check */ + if (offset >= NPU_DEV_CFG_SIZE) + return NULL; + if (offset & (size - 1)) + return NULL; + + return bdfn_to_npu_dev(p, bdfn); +} + +static struct npu_dev_trap *npu_dev_trap_check(struct npu_dev *dev, + uint32_t offset, + uint32_t size, + bool read) +{ + struct npu_dev_trap *trap; + + list_for_each(&dev->traps, trap, link) { + if (read && !trap->read) + continue; + if (!read && !trap->write) + continue; + + /* The requested region is overlapped with the one + * specified by the trap, to pick the trap and let it + * handle the request + */ + if (offset <= trap->end && + (offset + size - 1) >= trap->start) + return trap; + } + + return NULL; +} + +static int64_t _npu_dev_cfg_read(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint32_t *data, + size_t size) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev; + struct npu_dev_trap *trap; + int64_t ret; + + /* Data returned upon errors */ + *data = 0xffffffff; + + /* Retrieve NPU device */ + dev = npu_dev_cfg_check(p, bdfn, offset, size); + if (!dev) + return OPAL_PARAMETER; + + /* Retrieve trap */ + trap = npu_dev_trap_check(dev, offset, size, true); + if (trap) { + ret = trap->read(trap, offset, + size, (uint32_t *)data); + if (ret == OPAL_SUCCESS) + return ret; + } + + NPU_DEV_CFG_NORMAL_RD(dev, offset, size, data); + + return OPAL_SUCCESS; +} + +#define NPU_DEV_CFG_READ(size, type) \ +static int64_t npu_dev_cfg_read##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type *data) \ +{ \ + int64_t rc; \ + uint32_t val; \ + \ + /* Data returned upon errors */ \ + rc = 
_npu_dev_cfg_read(phb, bdfn, offset, &val, sizeof(*data)); \ + *data = (type)val; \ + return rc; \ +} + +static int64_t _npu_dev_cfg_write(struct phb *phb, uint32_t bdfn, + uint32_t offset, uint32_t data, + size_t size) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev; + struct npu_dev_trap *trap; + uint32_t val, v, r, c, i; + int64_t ret; + + /* Retrieve NPU device */ + dev = npu_dev_cfg_check(p, bdfn, offset, size); + if (!dev) + return OPAL_PARAMETER; + + /* Retrieve trap */ + trap = npu_dev_trap_check(dev, offset, size, false); + if (trap) { + ret = trap->write(trap, offset, + size, (uint32_t)data); + if (ret == OPAL_SUCCESS) + return ret; + } + + /* Handle read-only and W1C bits */ + val = data; + for (i = 0; i < size; i++) { + v = dev->config[NPU_DEV_CFG_NORMAL][offset + i]; + r = dev->config[NPU_DEV_CFG_RDONLY][offset + i]; + c = dev->config[NPU_DEV_CFG_W1CLR][offset + i]; + + /* Drop read-only bits */ + val &= ~(r << (i * 8)); + val |= (r & v) << (i * 8); + + /* Drop W1C bits */ + val &= ~(val & ((c & v) << (i * 8))); + } + + NPU_DEV_CFG_NORMAL_WR(dev, offset, size, val); + return OPAL_SUCCESS; +} + +#define NPU_DEV_CFG_WRITE(size, type) \ +static int64_t npu_dev_cfg_write##size(struct phb *phb, uint32_t bdfn, \ + uint32_t offset, type data) \ +{ \ + return _npu_dev_cfg_write(phb, bdfn, offset, \ + data, sizeof(data)); \ +} + +NPU_DEV_CFG_READ(8, u8) +NPU_DEV_CFG_READ(16, u16) +NPU_DEV_CFG_READ(32, u32) +NPU_DEV_CFG_WRITE(8, u8) +NPU_DEV_CFG_WRITE(16, u16) +NPU_DEV_CFG_WRITE(32, u32) + +/* + * Add calls to trap reads and writes to a NPU config space. + */ +static void npu_dev_add_cfg_trap(struct npu_dev *dev, uint32_t start, + uint32_t size, void *data, + int64_t (*read)(struct npu_dev_trap *, + uint32_t, + uint32_t, + uint32_t *), + int64_t (*write)(struct npu_dev_trap *, + uint32_t, + uint32_t, + uint32_t)) +{ + struct npu_dev_trap *trap; + + trap = zalloc(sizeof(struct npu_dev_trap)); + assert(trap); + trap->dev = dev; + trap->start = start; + trap->end = start + size - 1; + trap->read = read; + trap->write = write; + trap->data = data; + list_add_tail(&dev->traps, &trap->link); +} + +static int __npu_dev_bind_pci_dev(struct phb *phb __unused, + struct pci_device *pd, + void *data) +{ + struct npu_dev *dev = data; + struct dt_node *pci_dt_node; + uint32_t npu_npcq_phandle; + + /* Ignore non-nvidia PCI devices */ + if ((pd->vdid & 0xffff) != 0x10de) + return 0; + + /* Find the PCI devices pbcq */ + for (pci_dt_node = pd->dn->parent; + pci_dt_node && !dt_find_property(pci_dt_node, "ibm,pbcq"); + pci_dt_node = pci_dt_node->parent); + + if (!pci_dt_node) + return 0; + + npu_npcq_phandle = dt_prop_get_u32(dev->dt_node, "ibm,npu-pbcq"); + + if (dt_prop_get_u32(pci_dt_node, "ibm,pbcq") == npu_npcq_phandle && + (pd->vdid & 0xffff) == 0x10de) + return 1; + + return 0; +} + +static void npu_dev_bind_pci_dev(struct npu_dev *dev) +{ + struct phb *phb; + uint32_t i; + + if (dev->pd) + return; + + for (i = 0; i < 64; i++) { + if (dev->npu->phb.opal_id == i) + continue; + + phb = pci_get_phb(i); + if (!phb) + continue; + + dev->pd = pci_walk_dev(phb, __npu_dev_bind_pci_dev, dev); + if (dev->pd) { + dev->phb = phb; + return; + } + } + + prlog(PR_ERR, "%s: NPU device %04x:00:%02x.0 not binding to PCI device\n", + __func__, dev->npu->phb.opal_id, dev->index); +} + +static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED; + +/* Appends an NPU phandle to the given PCI device node ibm,npu + * property */ +static void npu_append_pci_phandle(struct dt_node *dn, u32 phandle) +{ + uint32_t 
*npu_phandles; + struct dt_property *pci_npu_phandle_prop; + size_t prop_len; + + /* Use a lock to make sure no one else has a reference to an + * ibm,npu property (this assumes this is the only function + * that holds a reference to it). */ + lock(&pci_npu_phandle_lock); + + /* This function shouldn't be called unless ibm,npu exists */ + pci_npu_phandle_prop = (struct dt_property *) + dt_require_property(dn, "ibm,npu", -1); + + /* Need to append to the properties */ + prop_len = pci_npu_phandle_prop->len; + prop_len += sizeof(*npu_phandles); + dt_resize_property(&pci_npu_phandle_prop, prop_len); + pci_npu_phandle_prop->len = prop_len; + + npu_phandles = (uint32_t *) pci_npu_phandle_prop->prop; + npu_phandles[prop_len/sizeof(*npu_phandles) - 1] = phandle; + unlock(&pci_npu_phandle_lock); +} + +static void npu_dn_fixup(struct phb *phb, struct pci_device *pd) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev; + + dev = bdfn_to_npu_dev(p, pd->bdfn); + assert(dev); + + if (dev->phb || dev->pd) + return; + + /* Bind the emulated PCI device with the real one, which can't + * be done until the PCI devices are populated. Once the real + * PCI device is identified, we also need fix the device-tree + * for it + */ + npu_dev_bind_pci_dev(dev); + if (dev->phb && dev->pd && dev->pd->dn) { + if (dt_find_property(dev->pd->dn, "ibm,npu")) + npu_append_pci_phandle(dev->pd->dn, pd->dn->phandle); + else + dt_add_property_cells(dev->pd->dn, "ibm,npu", pd->dn->phandle); + + dt_add_property_cells(pd->dn, "ibm,gpu", dev->pd->dn->phandle); + } +} + +static void npu_ioda_init(struct npu *p) +{ + uint64_t *data64; + uint32_t i; + + /* LXIVT - Disable all LSIs */ + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) { + data64 = &p->lxive_cache[i]; + *data64 = SETFIELD(NPU_IODA_LXIVT_PRIORITY, 0ul, 0xff); + *data64 = SETFIELD(NPU_IODA_LXIVT_SERVER, *data64, 0); + } + + /* PCT - Reset to reserved PE# */ + for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) { + data64 = &p->pce_cache[i]; + *data64 = SETFIELD(NPU_IODA_PCT_PE, 0ul, NPU_NUM_OF_PES); + *data64 |= NPU_IODA_PCT_LINK_ENABLED; + } + + /* Clear TVT */ + memset(p->tve_cache, 0, sizeof(p->tve_cache)); +} + +static int64_t npu_ioda_reset(struct phb *phb, bool purge) +{ + struct npu *p = phb_to_npu(phb); + uint32_t i; + + if (purge) { + NPUDBG(p, "Purging all IODA tables...\n"); + npu_ioda_init(p); + } + + /* LIST */ + npu_ioda_sel(p, NPU_IODA_TBL_LIST, 0, true); + for (i = 0; i < 8; i++) + out_be64(p->at_regs + NPU_IODA_DATA0, 0x1); + + /* LIXVT */ + npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) + out_be64(p->at_regs + NPU_IODA_DATA0, p->lxive_cache[i]); + + /* PCT */ + npu_ioda_sel(p, NPU_IODA_TBL_PCT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) + out_be64(p->at_regs + NPU_IODA_DATA0, p->pce_cache[i]); + + /* TVT */ + npu_ioda_sel(p, NPU_IODA_TBL_TVT, 0, true); + for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++) + out_be64(p->at_regs + NPU_IODA_DATA0, p->tve_cache[i]); + + return OPAL_SUCCESS; +} + +static int npu_isn_valid(struct npu *p, uint32_t isn) +{ + if (p->chip_id != p8_irq_to_chip(isn) || p->index != 0 || + NPU_IRQ_NUM(isn) < NPU_LSI_IRQ_MIN || + NPU_IRQ_NUM(isn) > NPU_LSI_IRQ_MAX) { + NPUERR(p, "isn 0x%x not valid for this NPU\n", isn); + return false; + } + + return true; +} + +static int64_t npu_lsi_get_xive(void *data, + uint32_t isn, + uint16_t *server, + uint8_t *prio) +{ + struct npu *p = data; + uint32_t irq = NPU_IRQ_NUM(isn); + uint64_t lxive; + + if (!npu_isn_valid(p, isn)) + 
return OPAL_PARAMETER; + + /* The content is fetched from the cache, which requires + * that the initial cache should be initialized with the + * default values + */ + irq -= NPU_LSI_IRQ_MIN; + lxive = p->lxive_cache[irq]; + *server = GETFIELD(NPU_IODA_LXIVT_SERVER, lxive); + *prio = GETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive); + + return OPAL_SUCCESS; +} + +static int64_t npu_lsi_set_xive(void *data, + uint32_t isn, + uint16_t server, + uint8_t prio) +{ + struct npu *p = data; + uint32_t irq = NPU_IRQ_NUM(isn); + uint64_t lxive; + + if (!npu_isn_valid(p, isn)) + return OPAL_PARAMETER; + + /* Figure out LXIVT entry */ + lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, 0ul, server); + lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio); + + /* Cache LXIVT entry */ + irq -= NPU_LSI_IRQ_MIN; + p->lxive_cache[irq] = lxive; + + /* Update to LXIVT entry */ + npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, irq, false); + lxive = in_be64(p->at_regs + NPU_IODA_DATA0); + lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, lxive, server); + lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio); + out_be64(p->at_regs + NPU_IODA_DATA0, lxive); + + return OPAL_SUCCESS; +} + +static void npu_err_interrupt(void *data, uint32_t isn) +{ + struct npu *p = data; + uint32_t irq = NPU_IRQ_NUM(isn); + + if (!npu_isn_valid(p, isn)) + return; + + /* There're 4 LSIs used for error reporting: 4/5 for data + * link error reporting while 6/7 for frozen PE detection + */ + irq -= NPU_LSI_IRQ_MIN; + switch (irq) { + case 4 ... 5: + prerror("Invalid NPU error interrupt received\n"); + break; + case 6 ... 7: + NPUERR(p, "Error handling not implemented\n"); + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, + OPAL_EVENT_PCI_ERROR); + } +} + +/* LSIs (OS owned) */ +static const struct irq_source_ops npu_lsi_irq_ops = { + .get_xive = npu_lsi_get_xive, + .set_xive = npu_lsi_set_xive, +}; + +/* Error LSIs (skiboot owned) */ +static const struct irq_source_ops npu_err_lsi_irq_ops = { + .get_xive = npu_lsi_get_xive, + .set_xive = npu_lsi_set_xive, + .interrupt = npu_err_interrupt, +}; + +static void npu_register_irq(struct npu *p) +{ + register_irq_source(&npu_lsi_irq_ops, p, + p->base_lsi, 4); + register_irq_source(&npu_err_lsi_irq_ops, p, + p->base_lsi + 4, 4); +} + +static void npu_hw_init(struct npu *p) +{ + /* 3 MMIO setup for AT */ + out_be64(p->at_regs + NPU_LSI_SOURCE_ID, + SETFIELD(NPU_LSI_SRC_ID_BASE, 0ul, 0x7f)); + out_be64(p->at_regs + NPU_INTREP_TIMER, 0x0ul); + npu_ioda_reset(&p->phb, false); +} + +static int64_t npu_map_pe_dma_window_real(struct phb *phb, + uint16_t pe_num, + uint16_t window_id, + uint64_t pci_start_addr, + uint64_t pci_mem_size) +{ + struct npu *p = phb_to_npu(phb); + uint64_t end = pci_start_addr + pci_mem_size; + uint64_t tve; + + /* Sanity check. Each PE has one corresponding TVE */ + if (pe_num >= NPU_NUM_OF_PES || + window_id != pe_num) + return OPAL_PARAMETER; + + if (pci_mem_size) { + /* Enable */ + + end = pci_start_addr + pci_mem_size; + + /* We have to be 16M aligned */ + if ((pci_start_addr & 0x00ffffff) || + (pci_mem_size & 0x00ffffff)) + return OPAL_PARAMETER; + + /* + * It *looks* like this is the max we can support (we need + * to verify this. Also we are not checking for rollover, + * but then we aren't trying too hard to protect ourselves + * againt a completely broken OS. 
+ */ + if (end > 0x0003ffffffffffffull) + return OPAL_PARAMETER; + + /* + * Put start address bits 49:24 into TVE[52:53]||[0:23] + * and end address bits 49:24 into TVE[54:55]||[24:47] + * and set TVE[51] + */ + tve = (pci_start_addr << 16) & (0xffffffull << 48); + tve |= (pci_start_addr >> 38) & (3ull << 10); + tve |= (end >> 8) & (0xfffffful << 16); + tve |= (end >> 40) & (3ull << 8); + tve |= PPC_BIT(51); + } else { + /* Disable */ + tve = 0; + } + + npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false); + out_be64(p->at_regs + NPU_IODA_DATA0, tve); + p->tve_cache[window_id] = tve; + + return OPAL_SUCCESS; +} + +static int64_t npu_map_pe_dma_window(struct phb *phb, + uint16_t pe_num, + uint16_t window_id, + uint16_t tce_levels, + uint64_t tce_table_addr, + uint64_t tce_table_size, + uint64_t tce_page_size) +{ + struct npu *p = phb_to_npu(phb); + uint64_t tts_encoded; + uint64_t data64 = 0; + + /* Sanity check. Each PE has one corresponding TVE */ + if (pe_num >= NPU_NUM_OF_PES || + window_id != pe_num) + return OPAL_PARAMETER; + + /* Special condition, zero TCE table size used to disable + * the TVE. + */ + if (!tce_table_size) { + npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false); + out_be64(p->at_regs + NPU_IODA_DATA0, 0ul); + p->tve_cache[window_id] = 0ul; + return OPAL_SUCCESS; + } + + /* Additional arguments validation */ + if (tce_levels < 1 || + tce_levels > 4 || + !is_pow2(tce_table_size) || + tce_table_size < 0x1000) + return OPAL_PARAMETER; + + /* TCE table size */ + data64 = SETFIELD(NPU_IODA_TVT_TTA, 0ul, tce_table_addr >> 12); + tts_encoded = ilog2(tce_table_size) - 11; + if (tts_encoded > 39) + return OPAL_PARAMETER; + data64 = SETFIELD(NPU_IODA_TVT_SIZE, data64, tts_encoded); + + /* TCE page size */ + switch (tce_page_size) { + case 0x10000: /* 64K */ + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 5); + break; + case 0x1000000: /* 16M */ + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 13); + break; + case 0x10000000: /* 256M */ + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 17); + break; + case 0x1000: /* 4K */ + default: + data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 1); + } + + /* Number of levels */ + data64 = SETFIELD(NPU_IODA_TVT_LEVELS, data64, tce_levels - 1); + + /* Update to hardware */ + npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false); + out_be64(p->at_regs + NPU_IODA_DATA0, data64); + p->tve_cache[window_id] = data64; + + return OPAL_SUCCESS; +} + +static int64_t npu_set_pe(struct phb *phb, + uint64_t pe_num, + uint64_t bdfn, + uint8_t bcompare, + uint8_t dcompare, + uint8_t fcompare, + uint8_t action) +{ + struct npu *p = phb_to_npu(phb); + struct npu_dev *dev; + uint32_t link_idx; + uint64_t *data64; + + /* Sanity check */ + if (action != OPAL_MAP_PE && + action != OPAL_UNMAP_PE) + return OPAL_PARAMETER; + if (pe_num >= NPU_NUM_OF_PES) + return OPAL_PARAMETER; + + /* All emulated PCI devices hooked to root bus, whose + * bus number is zero. 
+ */ + dev = bdfn_to_npu_dev(p, bdfn); + if ((bdfn >> 8) || !dev) + return OPAL_PARAMETER; + + link_idx = dev->index; + + /* Separate links will be mapped to different PEs */ + if (bcompare != OpalPciBusAll || + dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER || + fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER) + return OPAL_UNSUPPORTED; + + /* Map the link to the corresponding PE */ + data64 = &p->pce_cache[link_idx]; + if (action == OPAL_MAP_PE) + *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64, + pe_num); + else + *data64 = SETFIELD(NPU_IODA_PCT_PE, *data64, + NPU_NUM_OF_PES); + + *data64 |= NPU_IODA_PCT_LINK_ENABLED; + + npu_ioda_sel(p, NPU_IODA_TBL_PCT, link_idx, false); + out_be64(p->at_regs + NPU_IODA_DATA0, *data64); + + return OPAL_SUCCESS; +} + +static int64_t npu_link_state(struct phb *phb __unused) +{ + /* As we're emulating all PCI stuff, the link bandwidth + * isn't big deal anyway. + */ + return OPAL_SHPC_LINK_UP_x1; +} + +static int64_t npu_power_state(struct phb *phb __unused) +{ + return OPAL_SHPC_POWER_ON; +} + +static int64_t npu_freset(struct phb *phb __unused) +{ + /* FIXME: PHB fundamental reset, which need to be + * figured out later. It's used by EEH recovery + * upon fenced AT. + */ + return OPAL_SUCCESS; +} + +static int64_t npu_freeze_status(struct phb *phb __unused, + uint64_t pe_number __unused, + uint8_t *freeze_state, + uint16_t *pci_error_type __unused, + uint16_t *severity __unused, + uint64_t *phb_status __unused) +{ + /* FIXME: When it's called by skiboot PCI config accessor, + * the PE number is fixed to 0, which is incorrect. We need + * introduce another PHB callback to translate it. For now, + * it keeps the skiboot PCI enumeration going. + */ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + return OPAL_SUCCESS; +} + +static const struct phb_ops npu_ops = { + .lock = npu_lock, + .unlock = npu_unlock, + .cfg_read8 = npu_dev_cfg_read8, + .cfg_read16 = npu_dev_cfg_read16, + .cfg_read32 = npu_dev_cfg_read32, + .cfg_write8 = npu_dev_cfg_write8, + .cfg_write16 = npu_dev_cfg_write16, + .cfg_write32 = npu_dev_cfg_write32, + .choose_bus = NULL, + .device_init = NULL, + .device_node_fixup = npu_dn_fixup, + .presence_detect = NULL, + .ioda_reset = npu_ioda_reset, + .papr_errinjct_reset = NULL, + .pci_reinit = NULL, + .set_phb_mem_window = NULL, + .phb_mmio_enable = NULL, + .map_pe_mmio_window = NULL, + .map_pe_dma_window = npu_map_pe_dma_window, + .map_pe_dma_window_real = npu_map_pe_dma_window_real, + .pci_msi_eoi = NULL, + .set_xive_pe = NULL, + .get_msi_32 = NULL, + .get_msi_64 = NULL, + .set_pe = npu_set_pe, + .set_peltv = NULL, + .link_state = npu_link_state, + .power_state = npu_power_state, + .slot_power_off = NULL, + .slot_power_on = NULL, + .hot_reset = NULL, + .fundamental_reset = npu_freset, + .complete_reset = NULL, + .poll = NULL, + .eeh_freeze_status = npu_freeze_status, + .eeh_freeze_clear = NULL, + .eeh_freeze_set = NULL, + .next_error = NULL, + .err_inject = NULL, + .get_diag_data = NULL, + .get_diag_data2 = NULL, + .set_capi_mode = NULL, + .set_capp_recovery = NULL, +}; + +static void assign_mmio_bars(uint32_t gcid, uint32_t xscom, + struct dt_node *npu_dn, uint64_t mm_win[2]) +{ + uint64_t mem_start, mem_end; + struct npu_dev_bar bar; + struct dt_node *link; + + /* Configure BAR selection. + * + * Currently, each PHY contains 2 links and each link has 2 + * BARs. The first BAR is assigned to the DLTL region which is + * what the kernel uses. The second BAR is either assigned to + * either the PL or AT region or unassigned. 
The PL0/PL1/AT + * MMIO regions are not exposed to the kernel so we assigned + * them at the start of the available memory area followed by + * the DLTL regions. So we end up with the following memory + * map (assuming we're given a memory region starting at + * 0x3fff000000000): + * + * Link#0-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000420000 + * Link#0-BAR#1: PL0 BAR ( 2MB) - 0x3fff000000000 + * Link#1-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000440000 + * Link#1-BAR#1: AT BAR ( 64KB) - 0x3fff000400000 + * Link#2-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000460000 + * Link#2-BAR#1: PL1 BAR ( 2MB) - 0x3fff000200000 + * Link#3-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000480000 + * Link#3-BAR#1: UNASSIGNED + */ + xscom_write(gcid, xscom + NPU_AT_SCOM_OFFSET + NX_BAR, + 0x0211000043500000); + + xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_0, + &mem_start); + mem_start = GETFIELD(NX_MMIO_BAR_BASE, mem_start) << 12; + + xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 5) + NX_MMIO_BAR_0, + &mem_end); + mem_end = (GETFIELD(NX_MMIO_BAR_BASE, mem_end) << 12) + + get_bar_size(mem_end); + + /* PL0 BAR comes first at 0x3fff000000000 */ + bar.xscom = npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_1; + bar.base = mem_start; + bar.size = NX_MMIO_PL_SIZE; + npu_dev_bar_update(gcid, &bar, true); + + /* PL1 BAR */ + bar.xscom = npu_link_scom_base(npu_dn, xscom, 4) + NX_MMIO_BAR_1; + bar.base += bar.size; + bar.size = NX_MMIO_PL_SIZE; + npu_dev_bar_update(gcid, &bar, true); + + /* Then the AT BAR */ + bar.xscom = npu_link_scom_base(npu_dn, xscom, 1) + NX_MMIO_BAR_1; + bar.base += bar.size; + bar.size = NX_MMIO_AT_SIZE; + npu_dev_bar_update(gcid, &bar, true); + + /* Now we configure all the DLTL BARs. These are the ones + * actually exposed to the kernel. */ + mm_win[0] = bar.base + bar.size; + dt_for_each_node(npu_dn, link) { + uint32_t index; + + index = dt_prop_get_u32(link, "ibm,npu-link-index"); + bar.xscom = npu_link_scom_base(npu_dn, xscom, index) + + NX_MMIO_BAR_0; + bar.base += bar.size; + bar.size = NX_MMIO_DL_SIZE; + bar.base = ALIGN_UP(bar.base, bar.size); + npu_dev_bar_update(gcid, &bar, false); + } + mm_win[1] = (bar.base + bar.size) - mm_win[0]; + + /* If we weren't given enough room to setup all the BARs we + * require it's better to crash here than risk creating + * overlapping BARs which will xstop the machine randomly in + * the future.*/ + assert(bar.base + bar.size <= mem_end); +} + +/* Probe NPU device node and create PCI root device node + * accordingly. The NPU deivce node should specify number + * of links and xscom base address to access links. 
+ */ +static void npu_probe_phb(struct dt_node *dn) +{ + struct dt_node *np; + uint32_t gcid, index, xscom; + uint64_t at_bar[2], mm_win[2], val; + uint32_t links = 0; + char *path; + + /* Retrieve chip id */ + path = dt_get_path(dn); + gcid = dt_get_chip_id(dn); + index = dt_prop_get_u32(dn, "ibm,npu-index"); + dt_for_each_compatible(dn, np, "ibm,npu-link") + links++; + + prlog(PR_INFO, "Chip %d Found NPU%d (%d links) at %s\n", + gcid, index, links, path); + free(path); + + /* Retrieve xscom base addr */ + xscom = dt_get_address(dn, 0, NULL); + prlog(PR_INFO, " XSCOM Base: %08x\n", xscom); + + assign_mmio_bars(gcid, xscom, dn, mm_win); + + /* Retrieve AT BAR */ + xscom_read(gcid, npu_link_scom_base(dn, xscom, 1) + NX_MMIO_BAR_1, + &val); + if (!(val & NX_MMIO_BAR_ENABLE)) { + prlog(PR_ERR, " AT BAR disabled!\n"); + return; + } + + at_bar[0] = GETFIELD(NX_MMIO_BAR_BASE, val) << 12; + at_bar[1] = get_bar_size(val); + prlog(PR_INFO, " AT BAR: %016llx (%lldKB)\n", + at_bar[0], at_bar[1] / 0x400); + + /* Create PCI root device node */ + np = dt_new_addr(dt_root, "pciex", at_bar[0]); + if (!np) { + prlog(PR_ERR, "%s: Cannot create PHB device node\n", + __func__); + return; + } + + dt_add_property_strings(np, "compatible", + "ibm,power8-npu-pciex", "ibm,ioda2-npu-phb"); + dt_add_property_strings(np, "device_type", "pciex"); + dt_add_property(np, "reg", at_bar, sizeof(at_bar)); + + dt_add_property_cells(np, "ibm,phb-index", index); + dt_add_property_cells(np, "ibm,chip-id", gcid); + dt_add_property_cells(np, "ibm,xscom-base", xscom); + dt_add_property_cells(np, "ibm,npcq", dn->phandle); + dt_add_property_cells(np, "ibm,links", links); + dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win)); +} + +static void npu_dev_populate_vendor_cap(struct npu_dev_cap *cap) +{ + struct npu_dev *dev = cap->dev; + uint32_t offset = cap->start; + uint32_t val; + + /* Add version and length information */ + val = (cap->end - cap->start) | 0x1 << 8; + NPU_DEV_CFG_INIT_RO(dev, offset + 2, 4, val); + offset += 4; + + /* Defaults when the trap can't handle the read/write (eg. due + * to reading/writing less than 4 bytes). */ + val = 0x0; + NPU_DEV_CFG_INIT_RO(dev, offset, 4, val); + NPU_DEV_CFG_INIT_RO(dev, offset + 4, 4, val); + + /* Create a trap for AT/PL procedures */ + npu_dev_add_cfg_trap(dev, offset, 8, NULL, npu_dev_procedure_read, + npu_dev_procedure_write); + offset += 8; + + NPU_DEV_CFG_INIT_RO(dev, offset, 4, dev->index); +} + +static void npu_dev_populate_pcie_cap(struct npu_dev_cap *cap) +{ + struct npu_dev *dev = cap->dev; + uint32_t base = cap->start; + uint32_t val; + + /* Sanity check on capability ID */ + if (cap->id != PCI_CFG_CAP_ID_EXP) { + prlog(PR_NOTICE, "%s: Invalid capability ID %d (%d)\n", + __func__, cap->id, PCI_CFG_CAP_ID_EXP); + return; + } + + /* Sanity check on spanned registers */ + if ((cap->end - cap->start) < 0x40) { + prlog(PR_NOTICE, "%s: Invalid reg region [%x, %x] for cap %d\n", + __func__, cap->start, cap->end, cap->id); + return; + } + + /* 0x00 - ID/PCIE capability */ + val = cap->id; + val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20)); + NPU_DEV_CFG_INIT_RO(dev, base, 4, val); + + /* 0x04 - Device capability + * + * We should support FLR. 
Oterwhsie, it might have + * problem passing it through to userland via Linux + * VFIO infrastructure + */ + val = ((PCIE_MPSS_128) | + (PCIE_PHANTOM_NONE << 3) | + (PCIE_L0SL_MAX_NO_LIMIT << 6) | + (PCIE_L1L_MAX_NO_LIMIT << 9) | + (PCICAP_EXP_DEVCAP_FUNC_RESET)); + NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_DEVCAP, 4, val); + + /* 0x08 - Device control and status */ + NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_DEVCTL, 4, 0x00002810, + 0xffff0000, 0x000f0000); + + /* 0x0c - Link capability */ + val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4)); + NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_LCAP, 4, val); + + /* 0x10 - Link control and status */ + NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_LCTL, 4, 0x00130000, + 0xfffff000, 0xc0000000); + + /* 0x14 - Slot capability */ + NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_SLOTCAP, 4, 0x00000000); + + /* 0x18 - Slot control and status */ + NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_SLOTCTL, 4, 0x00000000); + + /* 0x1c - Root control and capability */ + NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_RC, 4, 0x00000000, + 0xffffffe0, 0x00000000); + + /* 0x20 - Root status */ + NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_RSTAT, 4, 0x00000000, + 0xffffffff, 0x00010000); + + /* 0x24 - Device capability 2 */ + NPU_DEV_CFG_INIT_RO(dev, base + PCIECAP_EXP_DCAP2, 4, 0x00000000); + + /* 0x28 - Device Control and status 2 */ + NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_DCTL2, 4, 0x00070000, + 0xffff0000, 0x00000000); + + /* 0x2c - Link capability 2 */ + NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_LCAP2, 4, 0x00000007); + + /* 0x30 - Link control and status 2 */ + NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_LCTL2, 4, 0x00000003, + 0xffff0000, 0x00200000); + + /* 0x34 - Slot capability 2 */ + NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_SCAP2, 4, 0x00000000); + + /* 0x38 - Slot control and status 2 */ + NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_SCTL2, 4, 0x00000000); +} + +static struct npu_dev_cap *npu_dev_create_capability(struct npu_dev *dev, + void (*populate)(struct npu_dev_cap *), + uint16_t id, + uint16_t start, + uint16_t end) +{ + struct npu_dev_cap *cap; + + /* Check if the capability is existing */ + cap = npu_dev_find_capability(dev, id); + if (cap) + return cap; + + /* Allocate new one */ + cap = zalloc(sizeof(struct npu_dev_cap)); + assert(cap); + + /* Put it into the pool */ + cap->id = id; + cap->start = start; + cap->end = end; + cap->dev = dev; + cap->populate = populate; + list_add_tail(&dev->capabilities, &cap->link); + + return cap; +} + +static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev, + uint16_t id) +{ + struct npu_dev_cap *cap; + + list_for_each(&dev->capabilities, cap, link) { + if (cap->id == id) + return cap; + } + + return NULL; +} + +/* + * All capabilities should be put into the device capability + * list according to register offset in ascending order for + * easy access at later point. 
+ */ +static void npu_dev_create_capabilities(struct npu_dev *dev) +{ + list_head_init(&dev->capabilities); + + /* PCI express capability */ + npu_dev_create_capability(dev, npu_dev_populate_pcie_cap, + PCI_CFG_CAP_ID_EXP, 0x40, 0x80); + + /* Vendor specific capability */ + npu_dev_create_capability(dev, npu_dev_populate_vendor_cap, + PCI_CFG_CAP_ID_VENDOR, 0x80, 0x90); +} + +static void npu_dev_create_cfg(struct npu_dev *dev) +{ + struct npu_dev_cap *cap; + uint32_t offset; + uint32_t last_cap_offset; + + /* Initialize config traps */ + list_head_init(&dev->traps); + + /* 0x00 - Vendor/Device ID */ + NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_VENDOR_ID, 4, 0x04ea1014); + + /* 0x04 - Command/Status + * + * Create one trap to trace toggling memory BAR enable bit + */ + NPU_DEV_CFG_INIT(dev, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8, + 0xf9000000); + + npu_dev_add_cfg_trap(dev, PCI_CFG_CMD, 1, NULL, NULL, + npu_dev_cfg_write_cmd); + + /* 0x08 - Rev/Class/Cache */ + NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_REV_ID, 4, 0x06800100); + + /* 0x0c - CLS/Latency Timer/Header/BIST */ + NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000); + + /* 0x10 - BARs, always 64-bits non-prefetchable + * + * Each emulated device represents one link and therefore + * there is one BAR for the assocaited DLTL region. + */ + + /* Low 32-bits */ + NPU_DEV_CFG_INIT(dev, PCI_CFG_BAR0, 4, + (dev->bar.base & 0xfffffff0) | dev->bar.flags, + 0x0000000f, 0x00000000); + + /* High 32-bits */ + NPU_DEV_CFG_INIT(dev, PCI_CFG_BAR1, 4, (dev->bar.base >> 32), + 0x00000000, 0x00000000); + + /* + * Create trap. Writting 0xFF's to BAR registers should be + * trapped and return size on next read + */ + npu_dev_add_cfg_trap(dev, PCI_CFG_BAR0, 8, &dev->bar, + npu_dev_cfg_read_bar, npu_dev_cfg_write_bar); + + /* 0x18/1c/20/24 - Disabled BAR#2/3/4/5 + * + * Mark those BARs readonly so that 0x0 will be returned when + * probing the length and the BARs will be skipped. + */ + NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR2, 4, 0x00000000); + NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR3, 4, 0x00000000); + NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR4, 4, 0x00000000); + NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR5, 4, 0x00000000); + + /* 0x28 - Cardbus CIS pointer */ + NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_CARDBUS_CIS, 4, 0x00000000); + + /* 0x2c - Subsystem ID */ + NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000); + + /* 0x30 - ROM BAR + * + * Force its size to be zero so that the kernel will skip + * probing the ROM BAR. We needn't emulate ROM BAR. 
+	/* 0x18/1c/20/24 - Disabled BAR#2/3/4/5
+	 *
+	 * Mark these BARs read-only so that 0x0 is returned when
+	 * their length is probed, causing them to be skipped.
+	 */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR2, 4, 0x00000000);
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR3, 4, 0x00000000);
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR4, 4, 0x00000000);
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR5, 4, 0x00000000);
+
+	/* 0x28 - Cardbus CIS pointer */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
+
+	/* 0x2c - Subsystem ID */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
+
+	/* 0x30 - ROM BAR
+	 *
+	 * Force its size to be zero so that the kernel will skip
+	 * probing the ROM BAR. We needn't emulate it.
+	 */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_ROMBAR, 4, 0xffffffff);
+
+	/* 0x34 - PCI Capability
+	 *
+	 * By default, we don't have any capabilities
+	 */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_CAP, 4, 0x00000000);
+
+	last_cap_offset = PCI_CFG_CAP - 1;
+	list_for_each(&dev->capabilities, cap, link) {
+		offset = cap->start;
+
+		/* Initialize config space for the capability */
+		if (cap->populate)
+			cap->populate(cap);
+
+		/* Add capability header */
+		NPU_DEV_CFG_INIT_RO(dev, offset, 2, cap->id);
+
+		/* Update the next capability pointer */
+		NPU_DEV_CFG_NORMAL_WR(dev, last_cap_offset + 1, 1, offset);
+
+		last_cap_offset = offset;
+	}
+
+	/* 0x38 - Reserved */
+	NPU_DEV_CFG_INIT_RO(dev, 0x38, 4, 0x00000000);
+
+	/* 0x3c - INT line/pin/Minimal grant/Maximal latency */
+	if (!(dev->index % 2))
+		NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_INT_LINE, 4, 0x00000100);
+	else
+		NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_INT_LINE, 4, 0x00000200);
+}
+
+static uint32_t npu_allocate_bdfn(struct npu *p, uint32_t pbcq)
+{
+	int i;
+	int dev = -1;
+	int bdfn = -1;
+
+	/* Find the highest function number allocated to emulated PCI
+	 * devices associated with this GPU. */
+	for (i = 0; i < p->total_devices; i++) {
+		int dev_bdfn = p->devices[i].bdfn;
+		dev = MAX(dev, dev_bdfn & 0xf8);
+
+		if (dt_prop_get_u32(p->devices[i].dt_node,
+				    "ibm,npu-pbcq") == pbcq)
+			bdfn = MAX(bdfn, dev_bdfn);
+	}
+
+	if (bdfn >= 0)
+		/* A device has already been allocated for this GPU,
+		 * so assign the emulated PCI device the next
+		 * function. */
+		return bdfn + 1;
+	else if (dev >= 0)
+		/* Otherwise allocate the next device number and use
+		 * function 0. */
+		return dev + (1 << 3);
+	else
+		return 0;
+}
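The allocation rule is easier to see with skiboot's types stripped away. This standalone sketch (fake_dev and allocate_bdfn are invented stand-ins) reproduces the logic above for two GPUs with two links each:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))

struct fake_dev { int bdfn, pbcq; };

/* Links that share a GPU (same pbcq) share a device number and get
 * consecutive functions; a new pbcq gets the next device, function 0 */
static int allocate_bdfn(struct fake_dev *devs, int n, int pbcq)
{
	int i, dev = -1, bdfn = -1;

	for (i = 0; i < n; i++) {
		dev = MAX(dev, devs[i].bdfn & 0xf8);
		if (devs[i].pbcq == pbcq)
			bdfn = MAX(bdfn, devs[i].bdfn);
	}

	if (bdfn >= 0)
		return bdfn + 1;	/* next function, same device */
	if (dev >= 0)
		return dev + (1 << 3);	/* next device, function 0 */
	return 0;
}

int main(void)
{
	struct fake_dev devs[4];
	int pbcqs[4] = { 0, 0, 1, 1 }, i;

	for (i = 0; i < 4; i++) {
		devs[i].bdfn = allocate_bdfn(devs, i, pbcqs[i]);
		devs[i].pbcq = pbcqs[i];
		printf("link %d -> dev %d fn %d\n", i,
		       (devs[i].bdfn & 0xf8) >> 3, devs[i].bdfn & 0x7);
	}
	return 0;	/* prints dev/fn 0/0, 0/1, 1/0, 1/1 */
}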
+static void npu_create_devices(struct dt_node *dn, struct npu *p)
+{
+	struct npu_dev *dev;
+	struct dt_node *npu_dn, *link;
+	uint32_t npu_phandle, index = 0;
+	uint64_t buid;
+	uint64_t lsisrcid;
+
+	lsisrcid = GETFIELD(NPU_LSI_SRC_ID_BASE,
+			    in_be64(p->at_regs + NPU_LSI_SOURCE_ID));
+	buid = SETFIELD(NP_BUID_BASE, 0ull,
+			(p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) | lsisrcid));
+	buid |= NP_BUID_ENABLE;
+
+	/* Get the NPU node, whose links we expand here into PCI-like
+	 * devices attached to our emulated PHB. */
+	npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
+	npu_dn = dt_find_by_phandle(dt_root, npu_phandle);
+	assert(npu_dn);
+
+	/* Walk the link@x nodes to initialize devices */
+	p->total_devices = 0;
+	p->phb.scan_map = 0;
+	dt_for_each_compatible(npu_dn, link, "ibm,npu-link") {
+		struct npu_dev_bar *bar;
+		uint32_t pbcq;
+		uint64_t val;
+		uint32_t j;
+
+		dev = &p->devices[index];
+		dev->index = dt_prop_get_u32(link, "ibm,npu-link-index");
+		dev->xscom = npu_link_scom_base(npu_dn, p->xscom_base,
+						dev->index);
+
+		dev->npu = p;
+		dev->dt_node = link;
+
+		/* We don't support MMIO PHY access yet */
+		dev->pl_base = NULL;
+
+		pbcq = dt_prop_get_u32(link, "ibm,npu-pbcq");
+		dev->bdfn = npu_allocate_bdfn(p, pbcq);
+
+		/* This must be done after calling
+		 * npu_allocate_bdfn() */
+		p->total_devices++;
+		p->phb.scan_map |= 0x1 << ((dev->bdfn & 0xf8) >> 3);
+
+		dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
+		dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
+
+		/* Set up BUID/ISRN */
+		xscom_write(p->chip_id, dev->xscom + NX_NP_BUID, buid);
+
+		/* Set up the emulated config space */
+		for (j = 0; j < NPU_DEV_CFG_MAX; j++)
+			dev->config[j] = zalloc(NPU_DEV_CFG_SIZE);
+		bar = &dev->bar;
+		bar->flags = (PCI_CFG_BAR_TYPE_MEM |
+			      PCI_CFG_BAR_MEM64);
+
+		/* Update BAR info */
+		bar->xscom = dev->xscom + NX_MMIO_BAR_0;
+		xscom_read(p->chip_id, bar->xscom, &val);
+		bar->base = GETFIELD(NX_MMIO_BAR_BASE, val) << 12;
+		bar->size = get_bar_size(val);
+
+		/*
+		 * The config space is initialised with the BAR
+		 * disabled, so make sure it is actually disabled in
+		 * hardware.
+		 */
+		npu_dev_bar_update(p->chip_id, bar, false);
+
+		/* Initialize capabilities */
+		npu_dev_create_capabilities(dev);
+
+		/* Initialize config space */
+		npu_dev_create_cfg(dev);
+
+		index++;
+	}
+}
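The BAR decode above leans on skiboot's IBM-bit-numbered field macros. A standalone sketch with local equivalents of PPC_BIT/PPC_BITMASK/GETFIELD (the register value here is made up) shows how a 4K-aligned base falls out of NX_MMIO_BAR_BASE:

#include <stdint.h>
#include <stdio.h>

/* Local equivalents of skiboot's helpers; in IBM bit numbering,
 * bit 0 is the MSB of the 64-bit value */
#define PPC_BIT(bit)		(0x8000000000000000ull >> (bit))
#define PPC_BITMASK(bs, be)	((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))
#define GETFIELD(m, v)		(((v) & (m)) >> __builtin_ctzll(m))

#define NX_MMIO_BAR_BASE	PPC_BITMASK(14, 51)
#define NX_MMIO_BAR_ENABLE	PPC_BIT(52)

int main(void)
{
	uint64_t val = 0x00003d0000000000ull;	/* made-up register value */
	uint64_t base = GETFIELD(NX_MMIO_BAR_BASE, val) << 12;

	printf("base 0x%llx, %s\n", (unsigned long long)base,
	       (val & NX_MMIO_BAR_ENABLE) ? "enabled" : "disabled");
	return 0;	/* base 0x3d0000000000, disabled */
}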
+static void npu_add_phb_properties(struct npu *p)
+{
+	struct dt_node *np = p->phb.dt_node;
+	uint32_t icsp = get_ics_phandle();
+	uint64_t tkill, mm_base, mm_size;
+	uint32_t base_lsi = p->base_lsi;
+	uint32_t map[] = { 0x0,   0x0, 0x0, 0x1, icsp, base_lsi,
+			   0x0,   0x0, 0x0, 0x2, icsp, base_lsi + 1,
+			   0x800, 0x0, 0x0, 0x1, icsp, base_lsi + 2,
+			   0x800, 0x0, 0x0, 0x2, icsp, base_lsi + 3 };
+	uint32_t mask[] = { 0xf800, 0x0, 0x0, 0x7 };
+
+	/* Add various properties that HB doesn't have to add, some
+	 * of them simply because they result from policy decisions
+	 * made in skiboot rather than in HB, such as the MMIO
+	 * windows going to PCI, interrupts, etc.
+	 */
+	dt_add_property_cells(np, "#address-cells", 3);
+	dt_add_property_cells(np, "#size-cells", 2);
+	dt_add_property_cells(np, "#interrupt-cells", 1);
+	dt_add_property_cells(np, "bus-range", 0, 0xff);
+	dt_add_property_cells(np, "clock-frequency", 0x200, 0);
+	dt_add_property_cells(np, "interrupt-parent", icsp);
+
+	/* DLPL Interrupts */
+	p->phb.lstate.int_size = 1;
+	p->phb.lstate.int_val[0][0] = p->base_lsi + NPU_LSI_INT_DL0;
+	p->phb.lstate.int_val[1][0] = p->base_lsi + NPU_LSI_INT_DL1;
+	p->phb.lstate.int_val[2][0] = p->base_lsi + NPU_LSI_INT_DL2;
+	p->phb.lstate.int_val[3][0] = p->base_lsi + NPU_LSI_INT_DL3;
+	p->phb.lstate.int_parent[0] = icsp;
+	p->phb.lstate.int_parent[1] = icsp;
+	p->phb.lstate.int_parent[2] = icsp;
+	p->phb.lstate.int_parent[3] = icsp;
+
+	/* Due to the way the emulated PCI devices are structured in
+	 * the device tree, the core PCI layer doesn't do this for
+	 * us. Besides, the swizzling wouldn't suit our needs even if
+	 * it did. */
+	dt_add_property(np, "interrupt-map", map, sizeof(map));
+	dt_add_property(np, "interrupt-map-mask", mask, sizeof(mask));
+
+	/* NPU PHB properties */
+	/* TODO: Due to an erratum, TCE KILL only works when DMA
+	 * traffic has been stopped. We need to implement the
+	 * workaround, which is to do a TCE kill all instead. */
+	tkill = cleanup_addr((uint64_t)p->at_regs) + NPU_TCE_KILL;
+	dt_add_property_cells(np, "ibm,opal-num-pes",
+			      NPU_NUM_OF_PES);
+	dt_add_property_cells(np, "ibm,opal-reserved-pe",
+			      NPU_NUM_OF_PES);
+	dt_add_property_cells(np, "ibm,opal-tce-kill",
+			      hi32(tkill), lo32(tkill));
+
+	/* The memory window is exposed as a 32-bit non-prefetchable
+	 * one, because a 64-bit prefetchable window gets special
+	 * treatment from the kernel.
+	 */
+	mm_base = p->mm_base;
+	mm_size = p->mm_size;
+	dt_add_property_cells(np, "ranges", 0x02000000,
+			      hi32(mm_base), lo32(mm_base),
+			      hi32(mm_base), lo32(mm_base),
+			      hi32(mm_size), lo32(mm_size));
+}
+
+static void npu_create_phb(struct dt_node *dn)
+{
+	const struct dt_property *prop;
+	struct npu *p;
+	uint32_t links;
+	void *pmem;
+
+	/* Retrieve number of devices */
+	links = dt_prop_get_u32(dn, "ibm,links");
+	pmem = zalloc(sizeof(struct npu) + links * sizeof(struct npu_dev));
+	assert(pmem);
+
+	/* Populate PHB */
+	p = pmem;
+	p->index = dt_prop_get_u32(dn, "ibm,phb-index");
+	p->chip_id = dt_prop_get_u32(dn, "ibm,chip-id");
+	p->xscom_base = dt_prop_get_u32(dn, "ibm,xscom-base");
+	p->total_devices = links;
+
+	/* This is the AT base */
+	p->at_xscom = p->xscom_base + NPU_AT_SCOM_OFFSET;
+	p->at_regs = (void *)dt_get_address(dn, 0, NULL);
+
+	prop = dt_require_property(dn, "ibm,mmio-window", -1);
+	assert(prop->len >= (2 * sizeof(uint64_t)));
+	p->mm_base = ((const uint64_t *)prop->prop)[0];
+	p->mm_size = ((const uint64_t *)prop->prop)[1];
+
+	p->devices = pmem + sizeof(struct npu);
+
+	/* Interrupt */
+	p->base_lsi = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) +
+		      NPU_LSI_IRQ_MIN;
+
+	/* Generic PHB */
+	p->phb.dt_node = dn;
+	p->phb.ops = &npu_ops;
+	p->phb.phb_type = phb_type_pcie_v3;
+
+	/* Populate devices */
+	npu_create_devices(dn, p);
+
+	/* Populate extra properties */
+	npu_add_phb_properties(p);
+
+	/* Register PHB */
+	pci_register_phb(&p->phb, -1);
+
+	/* Initialize IODA cache */
+	npu_ioda_init(p);
+
+	/* Register interrupt source */
+	npu_register_irq(p);
+
+	/* Initialize hardware */
+	npu_hw_init(p);
+}
+
+void probe_npu(void)
+{
+	struct dt_node *np;
+
+	/* Scan NPU XSCOM nodes */
+	dt_for_each_compatible(dt_root, np, "ibm,power8-npu")
+		npu_probe_phb(np);
+
+	/* Scan newly created PHB nodes */
+	dt_for_each_compatible(dt_root, np, "ibm,power8-npu-pciex")
+		npu_create_phb(np);
+}
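For reference, the interrupt-map built in npu_add_phb_properties() follows the standard Open Firmware PCI interrupt binding: with mask { 0xf800, 0, 0, 0x7 }, only the device-number bits of the child unit address and the interrupt pin participate in matching. A standalone decode of the four entries (the icsp and base_lsi values here are illustrative, not real ones):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t icsp = 1, base_lsi = 0x7f0;	/* illustrative values */
	uint32_t map[4][6] = {
		{ 0x0,   0, 0, 1, icsp, base_lsi },	/* dev 0, INTA */
		{ 0x0,   0, 0, 2, icsp, base_lsi + 1 },	/* dev 0, INTB */
		{ 0x800, 0, 0, 1, icsp, base_lsi + 2 },	/* dev 1, INTA */
		{ 0x800, 0, 0, 2, icsp, base_lsi + 3 },	/* dev 1, INTB */
	};
	int i;

	/* The device number lives in bits 15:11 of phys.hi, so
	 * 0x800 selects device 1 */
	for (i = 0; i < 4; i++)
		printf("device %u pin INT%c -> ICS irq 0x%x\n",
		       (map[i][0] >> 11) & 0x1f,
		       'A' + (int)map[i][3] - 1, map[i][5]);
	return 0;
}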
diff --git a/include/npu-regs.h b/include/npu-regs.h
new file mode 100644
index 00000000..f663a987
--- /dev/null
+++ b/include/npu-regs.h
@@ -0,0 +1,235 @@
+/* Copyright 2013-2015 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NPU_REGS_H
+#define __NPU_REGS_H
+
+/* Size of a single link */
+#define NPU_LINK_SIZE		0x40
+
+/* Link registers */
+#define NX_PB_ERR_RPT_0		0x00
+#define NX_PB_ERR_RPT_1		0x01
+#define NX_MMIO_BAR_0		0x02
+#define NX_MMIO_BAR_1		0x03
+#define   NX_MMIO_BAR_BASE	PPC_BITMASK(14,51)
+#define   NX_MMIO_BAR_ENABLE	PPC_BIT(52)
+#define   NX_MMIO_BAR_SIZE	PPC_BITMASK(53,55)
+#define NX_NODAL_BAR0		0x04
+#define NX_NODAL_BAR1		0x05
+#define   NX_NODAL_BAR_ENABLE	PPC_BIT(0)
+#define   NX_NODAL_BAR_MASK	PPC_BITMASK(1,14)
+#define   NX_NODAL_BAR_BASE	PPC_BITMASK(15,32)
+#define NX_GROUP_BAR0		0x06
+#define NX_GROUP_BAR1		0x07
+#define   NX_GROUP_BAR_ENABLE	PPC_BIT(0)
+#define   NX_GROUP_BAR_MASK	PPC_BITMASK(1,14)
+#define   NX_GROUP_BAR_BASE	PPC_BITMASK(15,32)
+#define NX_EPSILON_COUN		0x08
+#define   NX_EPSILON_COUN_DISABLE	PPC_BIT(6)
+#define NX_MISC_CONTROL		0x09
+#define NX_PB_DEBUG		0x0a
+#define NX_PB_ECC		0x0b
+#define NX_DEBUG_SNAPSHOT_0	0x0c
+#define NX_DEBUG_SNAPSHOT_1	0x0d
+#define NX_CS_CTL		0x0e
+#define NX_CONFIG_CQ		0x0f
+#define NX_MRBO0		0x10
+#define NX_MRBO1		0x11
+#define NX_AS_CMD_CFG		0x12
+#define NX_NP_BUID		0x13
+#define   NP_BUID_ENABLE	PPC_BIT(0)
+#define   NP_BUID_BASE		PPC_BITMASK(1,23)
+#define NX_TL_CMD_CR		0x20
+#define NX_TL_CMD_D_CR		0x21
+#define NX_TL_RSP_CR		0x22
+#define NX_TL_RSP_D_CR		0x23
+#define NX_DL_REG_ADDR		0x24
+#define NX_DL_REG_DATA		0x25
+#define NX_NTL_CONTROL		0x26
+#define NX_NTL_PMU_CONTROL	0x27
+#define NX_NTL_PMU_COUNT	0x28
+#define NX_NTL_ER_HOLD		0x29
+#define NX_NTL_FST_ERR		0x2a
+#define NX_NTL_ECC		0x2b
+#define NX_NTL_FST_MSK		0x2c
+
+/* NP AT register */
+#define NX_FIR			0x00
+#define NX_FIR_CLEAR		0x01
+#define NX_FIR_SET		0x02
+#define NX_FIR_MASK		0x03
+#define NX_FIR_MASK_CLR		0x04
+#define NX_FIR_MASK_SET		0x05
+#define NX_FIR_ACTION0		0x06
+#define NX_FIR_ACTION1		0x07
+#define NX_FIR_WOF		0x08
+#define NX_AT_PMU_CTRL		0x26
+#define NX_AT_PMU_CNT		0x27
+#define NX_AT_ERR_HOLD		0x28
+#define   NX_AT_ERR_HOLD_RESET	PPC_BIT(63)
+#define NX_AT_DEBUG		0x29
+#define NX_AT_ECC		0x2a
+#define NX_BAR			0x2b
+
+/* AT MMIO registers */
+#define NPU_LSI_SOURCE_ID	0x00100
+#define   NPU_LSI_SRC_ID_BASE	PPC_BITMASK(5,11)
+#define NPU_DMA_CHAN_STATUS	0x00110
+#define NPU_INTREP_TIMER	0x001f8
+#define NPU_DMARD_SYNC		0x00200
+#define   NPU_DMARD_SYNC_START_RD	PPC_BIT(0)
+#define   NPU_DMARD_SYNC_RD		PPC_BIT(1)
+#define   NPU_DMARD_SYNC_START_WR	PPC_BIT(2)
+#define   NPU_DMARD_SYNC_WR		PPC_BIT(3)
+#define NPU_TCE_KILL		0x00210
+#define NPU_IODA_ADDR		0x00220
+#define   NPU_IODA_AD_AUTOINC	PPC_BIT(0)
+#define   NPU_IODA_AD_TSEL	PPC_BITMASK(11,15)
+#define   NPU_IODA_AD_TADR	PPC_BITMASK(54,63)
+#define NPU_IODA_DATA0		0x00228
+#define NPU_XIVE_UPD		0x00248
+#define NPU_GEN_CAP		0x00250
+#define NPU_TCE_CAP		0x00258
+#define NPU_INT_CAP		0x00260
+#define NPU_EEH_CAP		0x00268
+#define NPU_VR			0x00800
+#define NPU_CTRLR		0x00810
+#define NPU_TCR			0x00880
+#define NPU_Q_DMA_R		0x00888
+#define NPU_AT_ESR		0x00c80
+#define NPU_AT_FESR		0x00c88
+#define NPU_AT_LR_ER		0x00c98
+#define NPU_AT_SI_ER		0x00ca0
+#define NPU_AT_FR_ER		0x00ca8
+#define NPU_AT_FE_ER		0x00cb0
+#define NPU_AT_ESMR		0x00cd0
+#define NPU_AT_FESMR		0x00cd8
+#define NPU_AT_I_LR0		0x00d00
+#define NPU_AT_I_LR1		0x00d08
+#define NPU_AT_I_LR2		0x00d10
+#define NPU_AT_I_LR3		0x00d18
+
+/* AT */
+#define NPU_AT_SCOM_OFFSET	0x180
+
+/* NTL */
+#define TL_CMD_CR		0x10000
+#define TL_CMD_D_CR		0x10008
+#define TL_RSP_CR		0x10010
+#define TL_RSP_D_CR		0x10018
+#define NTL_CONTROL		0x10020
+#define   NTL_CONTROL_RESET	PPC_BIT(0)
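All of the field macros above use IBM (big-endian) bit numbering, where bit 0 is the most significant bit of the 64-bit register. A standalone sketch with local copies of the two macros (mirroring skiboot's bitutils.h definitions) spells out what that means for a few of the values in this file:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PPC_BIT(bit)		(0x8000000000000000ull >> (bit))
#define PPC_BITMASK(bs, be)	((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))

int main(void)
{
	assert(PPC_BIT(0) == 0x8000000000000000ull);	/* MSB */
	assert(PPC_BIT(63) == 1ull);			/* LSB */
	/* NX_MMIO_BAR_ENABLE, PPC_BIT(52), is a conventional 1 << 11 */
	assert(PPC_BIT(52) == 0x800);
	/* NP_BUID_BASE, PPC_BITMASK(1,23), covers IBM bits 1-23 */
	assert(PPC_BITMASK(1, 23) == 0x7fffff0000000000ull);
	printf("IBM bit-ordering assertions hold\n");
	return 0;
}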
+/* IODA tables */
+#define NPU_IODA_TBL_LIST	1
+#define NPU_IODA_TBL_LXIVT	2
+#define NPU_IODA_TBL_PCT	4
+#define NPU_IODA_TBL_PESTB	8
+#define NPU_IODA_TBL_TVT	9
+#define NPU_IODA_TBL_TCD	10
+#define NPU_IODA_TBL_TDR	11
+#define NPU_IODA_TBL_PESTB_ADDR	12
+#define NPU_IODA_TBL_EA		16
+
+/* LXIVT */
+#define NPU_IODA_LXIVT_SERVER	PPC_BITMASK(8,23)
+#define NPU_IODA_LXIVT_PRIORITY	PPC_BITMASK(24,31)
+
+/* PCT */
+#define NPU_IODA_PCT_LINK_ENABLED	PPC_BIT(0)
+#define NPU_IODA_PCT_PE		PPC_BITMASK(2,3)
+
+/* TVT */
+#define NPU_IODA_TVT_TTA	PPC_BITMASK(0,47)
+#define NPU_IODA_TVT_LEVELS	PPC_BITMASK(48,50)
+#define   NPU_IODA_TVE_1_LEVEL	0
+#define   NPU_IODA_TVE_2_LEVELS	1
+#define   NPU_IODA_TVE_3_LEVELS	2
+#define   NPU_IODA_TVE_4_LEVELS	3
+#define NPU_IODA_TVT_SIZE	PPC_BITMASK(51,55)
+#define NPU_IODA_TVT_PSIZE	PPC_BITMASK(59,63)
+
+/* NDL Registers */
+#define NDL_STATUS		0xfff0
+#define NDL_CONTROL		0xfff4
+
+/* BAR Sizes */
+#define NX_MMIO_PL_SIZE		0x200000
+#define NX_MMIO_AT_SIZE		0x10000
+#define NX_MMIO_DL_SIZE		0x20000
+
+/* Translates a PHY SCOM address to an MMIO offset */
+#define PL_MMIO_ADDR(reg)	(((reg >> 32) & 0xfffffull) << 1)
+
+/* PHY register scom offsets & fields */
+#define RX_PR_CNTL_PL		0x0002180000000000
+#define   RX_PR_RESET		PPC_BIT(63)
+
+#define TX_MODE1_PL		0x0004040000000000
+#define   TX_LANE_PDWN		PPC_BIT(48)
+
+#define TX_MODE2_PL		0x00040c0000000000
+#define   TX_RXCAL		PPC_BIT(57)
+#define   TX_UNLOAD_CLK_DISABLE	PPC_BIT(56)
+
+#define TX_CNTL_STAT2		0x00041c0000000000
+#define   TX_FIFO_INIT		PPC_BIT(48)
+
+#define RX_BANK_CONTROLS	0x0000f80000000000
+#define   RX_LANE_ANA_PDWN	PPC_BIT(54)
+
+#define RX_MODE			0x0002000000000000
+#define   RX_LANE_DIG_PDWN	PPC_BIT(48)
+
+#define RX_PR_MODE		0x0002100000000000
+#define   RX_PR_PHASE_STEP	PPC_BITMASK(60, 63)
+
+#define RX_A_DAC_CNTL		0x0000080000000000
+#define   RX_PR_IQ_RES_SEL	PPC_BITMASK(58, 60)
+
+#define RX_LANE_BUSY_VEC_0_15	0x000b000000000000
+#define TX_FFE_TOTAL_2RSTEP_EN	0x000c240000000000
+#define   TX_FFE_TOTAL_ENABLE_P_ENC	PPC_BITMASK(49,55)
+#define   TX_FFE_TOTAL_ENABLE_N_ENC	PPC_BITMASK(57,63)
+#define TX_FFE_PRE_2RSTEP_SEL	0x000c2c0000000000
+#define   TX_FFE_PRE_P_SEL_ENC	PPC_BITMASK(51,54)
+#define   TX_FFE_PRE_N_SEL_ENC	PPC_BITMASK(59,62)
+#define TX_FFE_MARGIN_2RSTEP_SEL	0x000c340000000000
+#define   TX_FFE_MARGIN_PU_P_SEL_ENC	PPC_BITMASK(51,55)
+#define   TX_FFE_MARGIN_PD_N_SEL_ENC	PPC_BITMASK(59,63)
+#define TX_IORESET_VEC_0_15	0x000d2c0000000000
+#define TX_IMPCAL_PB		0x000f040000000000
+#define   TX_ZCAL_REQ		PPC_BIT(49)
+#define   TX_ZCAL_DONE		PPC_BIT(50)
+#define   TX_ZCAL_ERROR		PPC_BIT(51)
+#define TX_IMPCAL_NVAL_PB	0x000f0c0000000000
+#define   TX_ZCAL_N		PPC_BITMASK(48,56)
+#define TX_IMPCAL_PVAL_PB	0x000f140000000000
+#define   TX_ZCAL_P		PPC_BITMASK(48,56)
+#define RX_EO_STEP_CNTL_PG	0x0008300000000000
+#define   RX_EO_ENABLE_LATCH_OFFSET_CAL	PPC_BIT(48)
+#define   RX_EO_ENABLE_CM_COARSE_CAL	PPC_BIT(57)
+#define RX_RUN_LANE_VEC_0_15	0x0009b80000000000
+#define RX_RECAL_ABORT_VEC_0_15	0x0009c80000000000
+#define RX_IORESET_VEC_0_15	0x0009d80000000000
+#define RX_EO_RECAL_PG		0x000a800000000000
+#define RX_INIT_DONE_VEC_0_15	0x000ac00000000000
+#define TX_IMPCAL_SWO1_PB	0x000f240000000000
+#define   TX_ZCAL_SWO_EN	PPC_BIT(48)
+#define TX_IMPCAL_SWO2_PB	0x000f2c0000000000
+
+#endif /* __NPU_REGS_H */
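The PL_MMIO_ADDR translation can be checked in isolation: the PHY register number sits in bits 32-51 of the SCOM address, and the macro turns it into a halfword-granular MMIO offset. A standalone sketch using two of the addresses defined above:

#include <stdint.h>
#include <stdio.h>

#define PL_MMIO_ADDR(reg)	(((reg >> 32) & 0xfffffull) << 1)

int main(void)
{
	uint64_t regs[] = { 0x0002180000000000ull,	/* RX_PR_CNTL_PL */
			    0x0004040000000000ull };	/* TX_MODE1_PL */
	unsigned int i;

	for (i = 0; i < 2; i++)
		printf("scom 0x%016llx -> mmio offset 0x%llx\n",
		       (unsigned long long)regs[i],
		       (unsigned long long)PL_MMIO_ADDR(regs[i]));
	return 0;	/* offsets 0x43000 and 0x80808 */
}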
diff --git a/include/npu.h b/include/npu.h
new file mode 100644
index 00000000..795b7047
--- /dev/null
+++ b/include/npu.h
@@ -0,0 +1,211 @@
+/* Copyright 2013-2015 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NPU_H
+#define __NPU_H
+
+/* Number of PEs supported */
+#define NPU_NUM_OF_PES	4
+
+/* Each brick has at most 2 MMIO BARs. BAR0 is always used to map the
+ * 128KB TL/DL registers. BAR1 is used to map either the PL or the AT
+ * registers, which are not exposed to the OS.
+ */
+#define NPU_BRICK_NUM_OF_BARS	2
+#define NPU_BRICK_TL_BAR_SIZE	0x20000
+#define NPU_BRICK_PL_BAR_SIZE	0x200000
+
+/* The config space of the NPU device is emulated. We have different
+ * bits to represent config register properties: read-only,
+ * write-one-to-clear.
+ */
+#define NPU_DEV_CFG_NORMAL	0
+#define NPU_DEV_CFG_RDONLY	1
+#define NPU_DEV_CFG_W1CLR	2
+#define NPU_DEV_CFG_MAX		3
+
+/* Bytes of the emulated NPU PCI device config space. We are
+ * emulating a PCI express device, not a legacy one.
+ */
+#define NPU_DEV_CFG_SIZE	0x100
+
+/* Interrupt mapping
+ *
+ * The NPU PHB doesn't support MSI interrupts. It only supports
+ * 8 LSI interrupts: [0, 3] for the bricks' DL blocks, [4, 5] for
+ * reporting errors from the DL blocks, and [6, 7] for reporting
+ * errors from the TL blocks, NPCQs and AT.
+ */
+#define NPU_LSI_IRQ_COUNT	8
+#define NPU_LSI_INT_DL0		0
+#define NPU_LSI_INT_DL1		1
+#define NPU_LSI_INT_DL2		2
+#define NPU_LSI_INT_DL3		3
+#define NPU_LSI_IRQ_MIN		0x7F0
+#define NPU_LSI_IRQ_MAX		(NPU_LSI_IRQ_MIN + NPU_LSI_IRQ_COUNT - 1)
+#define NPU_LSI_IRQ_BASE(chip, phb)	(P8_CHIP_IRQ_PHB_BASE(chip, phb) | NPU_LSI_IRQ_MIN)
+#define NPU_IRQ_NUM(irq)	(irq & 0x7FF)
+
+/* NPU device capability descriptor. All PCI capabilities are
+ * organized as a linked list. Each PCI capability has a specific
+ * hook to populate it when initializing the NPU device.
+ */
+struct npu_dev;
+struct npu_dev_cap {
+	uint16_t		id;
+	uint16_t		start;
+	uint16_t		end;
+	struct npu_dev		*dev;
+	void			(*populate)(struct npu_dev_cap *cap);
+	struct list_node	link;
+};
+
+/* Config space access trap. */
+struct npu_dev_trap {
+	struct npu_dev		*dev;
+	uint32_t		start;
+	uint32_t		end;
+	void			*data;
+	int64_t			(*read)(struct npu_dev_trap *trap,
+					uint32_t offset,
+					uint32_t size,
+					uint32_t *data);
+	int64_t			(*write)(struct npu_dev_trap *trap,
+					 uint32_t offset,
+					 uint32_t size,
+					 uint32_t data);
+	struct list_node	link;
+};
+
+struct npu_dev_bar {
+	uint32_t		flags;
+	uint32_t		xscom;
+	uint64_t		base;
+	uint64_t		size;
+	uint32_t		bar_sz;
+	bool			trapped;
+};
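To illustrate the shape of these hooks, the sketch below wires an invented handler to an invented trap descriptor. fake_trap mirrors only the fields the handler uses, and the bounds semantics here are assumed for the example, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

struct fake_trap { uint32_t start, end; void *data; };

/* An invented handler: exposes a free-running counter at whatever
 * config offset the trap is bound to */
static int64_t counter_read(struct fake_trap *trap, uint32_t offset,
			    uint32_t size, uint32_t *data)
{
	uint32_t *counter = trap->data;

	if (offset < trap->start || offset + size > trap->end)
		return -1;	/* outside the trapped window */

	*data = (*counter)++;
	return 0;
}

int main(void)
{
	uint32_t counter = 0, val;
	struct fake_trap trap = { .start = 0x40, .end = 0x44,
				  .data = &counter };

	counter_read(&trap, 0x40, 4, &val);
	counter_read(&trap, 0x40, 4, &val);
	printf("second read returned %u\n", val);	/* 1 */
	return 0;
}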
+/* Each device contains 2 links. The device is exposed as a standard
+ * PCIe device, and its config space is emulated by skiboot.
+ */
+struct npu_dev {
+	uint32_t		flags;
+	uint32_t		index;
+	uint64_t		xscom;
+	void			*pl_base;
+	uint64_t		pl_xscom_base;
+	struct npu_dev_bar	bar;
+	struct phb		*phb;
+
+	/* Device and function numbers are allocated based on GPU
+	 * association */
+	uint32_t		bdfn;
+
+	/* The link@x node */
+	struct dt_node		*dt_node;
+
+	/* The GPU PCI device this NPU device is associated with */
+	struct pci_device	*pd;
+
+	struct npu		*npu;
+	uint8_t			*config[NPU_DEV_CFG_MAX];
+	struct list_head	capabilities;
+	struct list_head	traps;
+
+	/* Which PHY lanes this device is associated with */
+	uint16_t		lane_mask;
+
+	/* Used to store the currently running procedure number for
+	 * this device. */
+	uint16_t		procedure_number;
+
+	/* Used to store the step within a procedure that we are up
+	 * to. */
+	uint16_t		procedure_step;
+
+	/* Arbitrary data used by each procedure to track status. */
+	uint64_t		procedure_data;
+
+	/* Used to timeout long running procedures. */
+	unsigned long		procedure_tb;
+
+	uint32_t		procedure_status;
+};
+
+/* NPU PHB descriptor */
+struct npu {
+	uint32_t		flags;
+	uint32_t		index;
+	struct lock		lock;
+	uint32_t		chip_id;
+	uint64_t		xscom_base;
+	uint64_t		at_xscom;
+	void			*at_regs;
+	uint32_t		base_lsi;
+	uint64_t		mm_base;
+	uint64_t		mm_size;
+	uint32_t		total_devices;
+	struct npu_dev		*devices;
+
+	/* IODA cache */
+	uint64_t		lxive_cache[8];
+	uint64_t		pce_cache[6];
+	uint64_t		tve_cache[NPU_NUM_OF_PES];
+
+	bool			tx_zcal_complete[2];
+
+	struct phb		phb;
+};
+
+static inline struct npu *phb_to_npu(struct phb *phb)
+{
+	return container_of(phb, struct npu, phb);
+}
+
+static inline void npu_ioda_sel(struct npu *p, uint32_t table,
+				uint32_t addr, bool autoinc)
+{
+	out_be64(p->at_regs + NPU_IODA_ADDR,
+		 (autoinc ? NPU_IODA_AD_AUTOINC : 0) |
+		 SETFIELD(NPU_IODA_AD_TSEL, 0ul, table) |
+		 SETFIELD(NPU_IODA_AD_TADR, 0ul, addr));
+}
+
+void npu_scom_init(struct npu_dev *dev);
+
+int64_t npu_dev_procedure_read(struct npu_dev_trap *trap,
+			       uint32_t offset,
+			       uint32_t size,
+			       uint32_t *data);
+
+int64_t npu_dev_procedure_write(struct npu_dev_trap *trap,
+				uint32_t offset,
+				uint32_t size,
+				uint32_t data);
+
+#define NPUDBG(p, fmt, a...)	prlog(PR_DEBUG, "NPU%d: " fmt, \
+				      (p)->phb.opal_id, ##a)
+#define NPUINF(p, fmt, a...)	prlog(PR_INFO,  "NPU%d: " fmt, \
+				      (p)->phb.opal_id, ##a)
+#define NPUERR(p, fmt, a...)	prlog(PR_ERR,   "NPU%d: " fmt, \
+				      (p)->phb.opal_id, ##a)
+
+#define NPUDEVDBG(p, fmt, a...)	NPUDBG((p)->npu, fmt, ##a)
+#define NPUDEVINF(p, fmt, a...)	NPUINF((p)->npu, fmt, ##a)
+#define NPUDEVERR(p, fmt, a...)	NPUERR((p)->npu, fmt, ##a)
+
+#endif /* __NPU_H */
diff --git a/include/skiboot.h b/include/skiboot.h
index 4eec6dbb..18db6cfc 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -202,6 +202,7 @@ extern void probe_p7ioc(void);
 extern void probe_phb3(void);
 extern int phb3_preload_capp_ucode(void);
 extern void phb3_preload_vpd(void);
+extern void probe_npu(void);
 extern void uart_init(bool enable_interrupt);
 extern void homer_init(void);
 extern void occ_pstates_init(void);
@@ -264,4 +265,3 @@ extern bool slw_timer_ok(void);
 extern void fake_rtc_init(void);
 
 #endif /* __SKIBOOT_H */
-
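As a closing illustration, the indirect IODA access pattern combines npu_ioda_sel() with reads of the NPU_IODA_DATA0 port. The sketch below is not part of the patch, only uses identifiers the patch defines, and assumes, by analogy with other IODA-style PHBs, that the auto-increment bit advances the table pointer on each data-port access:

/* Illustrative only: dump the TVT entries of an NPU via the
 * indirect IODA window. Assumes a valid struct npu *p. */
static void dump_tves(struct npu *p)
{
	uint64_t tve;
	uint32_t i;

	/* Select the TVT with auto-increment, then read the data
	 * port once per PE; the address register advances itself */
	npu_ioda_sel(p, NPU_IODA_TBL_TVT, 0, true);
	for (i = 0; i < NPU_NUM_OF_PES; i++) {
		tve = in_be64(p->at_regs + NPU_IODA_DATA0);
		NPUDBG(p, "TVE[%d] = 0x%016llx\n", i,
		       (unsigned long long)tve);
	}
}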