diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 710 |
1 files changed, 557 insertions, 153 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5a7f893cf724..39cd545976b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -65,16 +65,23 @@ #include "amdgpu_ras.h" #include "amdgpu_pmu.h" +#include <linux/suspend.h> +#include <drm/task_barrier.h> + MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 -static const char *amdgpu_asic_name[] = { +const char *amdgpu_asic_name[] = { "TAHITI", "PITCAIRN", "VERDE", @@ -98,7 +105,11 @@ static const char *amdgpu_asic_name[] = { "VEGA12", "VEGA20", "RAVEN", + "ARCTURUS", + "RENOIR", "NAVI10", + "NAVI14", + "NAVI12", "LAST", }; @@ -127,14 +138,14 @@ static DEVICE_ATTR(pcie_replay_count, S_IRUGO, static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); /** - * amdgpu_device_is_px - Is the device is a dGPU with HG/PX power control + * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control * * @dev: drm_device pointer * * Returns true if the device is a dGPU with HG/PX power control, * otherwise return false. */ -bool amdgpu_device_is_px(struct drm_device *dev) +bool amdgpu_device_supports_boco(struct drm_device *dev) { struct amdgpu_device *adev = dev->dev_private; @@ -143,6 +154,51 @@ bool amdgpu_device_is_px(struct drm_device *dev) return false; } +/** + * amdgpu_device_supports_baco - Does the device support BACO + * + * @dev: drm_device pointer + * + * Returns true if the device supporte BACO, + * otherwise return false. + */ +bool amdgpu_device_supports_baco(struct drm_device *dev) +{ + struct amdgpu_device *adev = dev->dev_private; + + return amdgpu_asic_supports_baco(adev); +} + +/** + * VRAM access helper functions. + * + * amdgpu_device_vram_access - read/write a buffer in vram + * + * @adev: amdgpu_device pointer + * @pos: offset of the buffer in vram + * @buf: virtual address of the buffer in system memory + * @size: read/write size, sizeof(@buf) must > @size + * @write: true - write to vram, otherwise - read from vram + */ +void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, + uint32_t *buf, size_t size, bool write) +{ + uint64_t last; + unsigned long flags; + + last = size - 4; + for (last += pos; pos <= last; pos += 4) { + spin_lock_irqsave(&adev->mmio_idx_lock, flags); + WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); + WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31); + if (write) + WREG32_NO_KIQ(mmMM_DATA, *buf++); + else + *buf++ = RREG32_NO_KIQ(mmMM_DATA); + spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); + } +} + /* * MMIO register access helper functions. */ @@ -160,8 +216,8 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, { uint32_t ret; - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) - return amdgpu_virt_kiq_rreg(adev, reg); + if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) + return amdgpu_kiq_rreg(adev, reg); if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ -238,8 +294,8 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, adev->last_mm_index = v; } - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) - return amdgpu_virt_kiq_wreg(adev, reg, v); + if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) + return amdgpu_kiq_wreg(adev, reg, v); if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); @@ -413,6 +469,40 @@ static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32 } /** + * amdgpu_invalid_rreg64 - dummy 64 bit reg read function + * + * @adev: amdgpu device pointer + * @reg: offset of register + * + * Dummy register read function. Used for register blocks + * that certain asics don't have (all asics). + * Returns the value in the register. + */ +static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) +{ + DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); + BUG(); + return 0; +} + +/** + * amdgpu_invalid_wreg64 - dummy reg write function + * + * @adev: amdgpu device pointer + * @reg: offset of register + * @v: value to write to the register + * + * Dummy register read function. Used for register blocks + * that certain asics don't have (all asics). + */ +static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) +{ + DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", + reg, v); + BUG(); +} + +/** * amdgpu_block_invalid_rreg - dummy reg read function * * @adev: amdgpu device pointer @@ -895,7 +985,7 @@ static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) { struct sysinfo si; - bool is_os_64 = (sizeof(void *) == 8) ? true : false; + bool is_os_64 = (sizeof(void *) == 8); uint64_t total_memory; uint64_t dram_size_seven_GB = 0x1B8000000; uint64_t dram_size_three_GB = 0xB8000000; @@ -942,8 +1032,6 @@ def_value: */ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) { - int ret = 0; - if (amdgpu_sched_jobs < 4) { dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", amdgpu_sched_jobs); @@ -981,15 +1069,9 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_device_check_block_size(adev); - ret = amdgpu_device_get_job_timeout_settings(adev); - if (ret) { - dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); - return ret; - } - adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); - return ret; + return 0; } /** @@ -1004,8 +1086,9 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) { struct drm_device *dev = pci_get_drvdata(pdev); + int r; - if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF) + if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF) return; if (state == VGA_SWITCHEROO_ON) { @@ -1013,7 +1096,12 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero /* don't suspend or resume card normally */ dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; - amdgpu_device_resume(dev, true, true); + pci_set_power_state(dev->pdev, PCI_D0); + pci_restore_state(dev->pdev); + r = pci_enable_device(dev->pdev); + if (r) + DRM_WARN("pci_enable_device failed (%d)\n", r); + amdgpu_device_resume(dev, true); dev->switch_power_state = DRM_SWITCH_POWER_ON; drm_kms_helper_poll_enable(dev); @@ -1021,7 +1109,11 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero pr_info("amdgpu: switched off\n"); drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; - amdgpu_device_suspend(dev, true, true); + amdgpu_device_suspend(dev, true); + pci_save_state(dev->pdev); + /* Shut down the device */ + pci_disable_device(dev->pdev); + pci_set_power_state(dev->pdev, PCI_D3cold); dev->switch_power_state = DRM_SWITCH_POWER_OFF; } } @@ -1384,9 +1476,21 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) else chip_name = "raven"; break; + case CHIP_ARCTURUS: + chip_name = "arcturus"; + break; + case CHIP_RENOIR: + chip_name = "renoir"; + break; case CHIP_NAVI10: chip_name = "navi10"; break; + case CHIP_NAVI14: + chip_name = "navi14"; + break; + case CHIP_NAVI12: + chip_name = "navi12"; + break; } snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); @@ -1415,6 +1519,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes)); + if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) + goto parse_soc_bounding_box; + adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); @@ -1442,14 +1549,18 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) adev->gfx.config.num_packer_per_sc = le32_to_cpu(gpu_info_fw->num_packer_per_sc); } -#ifdef CONFIG_DRM_AMD_DC_DCN2_0 + +parse_soc_bounding_box: + /* + * soc bounding box info is not integrated in disocovery table, + * we always need to parse it from gpu info firmware. + */ if (hdr->version_minor == 2) { const struct gpu_info_firmware_v1_2 *gpu_info_fw = (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes)); adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; } -#endif break; } default: @@ -1529,7 +1640,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: - if (adev->asic_type == CHIP_RAVEN) + case CHIP_ARCTURUS: + case CHIP_RENOIR: + if (adev->asic_type == CHIP_RAVEN || + adev->asic_type == CHIP_RENOIR) adev->family = AMDGPU_FAMILY_RV; else adev->family = AMDGPU_FAMILY_AI; @@ -1539,6 +1653,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) return r; break; case CHIP_NAVI10: + case CHIP_NAVI14: + case CHIP_NAVI12: adev->family = AMDGPU_FAMILY_NV; r = nv_set_ip_blocks(adev); @@ -1554,19 +1670,19 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) if (r) return r; + if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) + amdgpu_discovery_get_gfx_info(adev); + amdgpu_amdkfd_device_probe(adev); if (amdgpu_sriov_vf(adev)) { r = amdgpu_virt_request_full_gpu(adev, true); if (r) return -EAGAIN; - - /* query the reg access mode at the very beginning */ - amdgpu_virt_init_reg_access_mode(adev); } adev->pm.pp_feature = amdgpu_pp_feature_mask; - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; for (i = 0; i < adev->num_ip_blocks; i++) { @@ -1665,29 +1781,36 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) if (adev->asic_type >= CHIP_VEGA10) { for (i = 0; i < adev->num_ip_blocks; i++) { - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { - if (adev->in_gpu_reset || adev->in_suspend) { - if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) - break; /* sriov gpu reset, psp need to do hw_init before IH because of hw limit */ - r = adev->ip_blocks[i].version->funcs->resume(adev); - if (r) { - DRM_ERROR("resume of IP block <%s> failed %d\n", + if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) + continue; + + /* no need to do the fw loading again if already done*/ + if (adev->ip_blocks[i].status.hw == true) + break; + + if (adev->in_gpu_reset || adev->in_suspend) { + r = adev->ip_blocks[i].version->funcs->resume(adev); + if (r) { + DRM_ERROR("resume of IP block <%s> failed %d\n", adev->ip_blocks[i].version->funcs->name, r); - return r; - } - } else { - r = adev->ip_blocks[i].version->funcs->hw_init(adev); - if (r) { - DRM_ERROR("hw_init of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); - return r; - } + return r; + } + } else { + r = adev->ip_blocks[i].version->funcs->hw_init(adev); + if (r) { + DRM_ERROR("hw_init of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); + return r; } - adev->ip_blocks[i].status.hw = true; } + + adev->ip_blocks[i].status.hw = true; + break; } } - r = amdgpu_pm_load_smu_firmware(adev, &smu_version); + + if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) + r = amdgpu_pm_load_smu_firmware(adev, &smu_version); return r; } @@ -1754,6 +1877,9 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) } } + if (amdgpu_sriov_vf(adev)) + amdgpu_virt_init_data_exchange(adev); + r = amdgpu_ib_pool_init(adev); if (r) { dev_err(adev->dev, "IB initialization failed (%d).\n", r); @@ -1777,16 +1903,26 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; + /* + * retired pages will be loaded from eeprom and reserved here, + * it should be called after amdgpu_device_ip_hw_init_phase2 since + * for some ASICs the RAS EEPROM code relies on SMU fully functioning + * for I2C communication which only true at this point. + * recovery_init may fail, but it can free all resources allocated by + * itself and its failure should not stop amdgpu init process. + * + * Note: theoretically, this should be called before all vram allocations + * to protect retired page from abusing + */ + amdgpu_ras_recovery_init(adev); + if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); amdgpu_amdkfd_device_init(adev); init_failed: - if (amdgpu_sriov_vf(adev)) { - if (!r) - amdgpu_virt_init_data_exchange(adev); + if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, true); - } return r; } @@ -1825,6 +1961,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) * amdgpu_device_set_cg_state - set clockgating for amdgpu device * * @adev: amdgpu_device pointer + * @state: clockgating state (gate or ungate) * * The list of all the hardware IPs that make up the asic is walked and the * set_clockgating_state callbacks are run. @@ -1849,6 +1986,7 @@ static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && adev->ip_blocks[i].version->funcs->set_clockgating_state) { /* enable clockgating to save power */ r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, @@ -1879,6 +2017,7 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_power if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && adev->ip_blocks[i].version->funcs->set_powergating_state) { /* enable powergating to save power */ r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, @@ -1944,6 +2083,7 @@ out: */ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) { + struct amdgpu_gpu_instance *gpu_instance; int i = 0, r; for (i = 0; i < adev->num_ip_blocks; i++) { @@ -1969,8 +2109,39 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) if (r) DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); - /* set to low pstate by default */ - amdgpu_xgmi_set_pstate(adev, 0); + + if (adev->gmc.xgmi.num_physical_nodes > 1) { + mutex_lock(&mgpu_info.mutex); + + /* + * Reset device p-state to low as this was booted with high. + * + * This should be performed only after all devices from the same + * hive get initialized. + * + * However, it's unknown how many device in the hive in advance. + * As this is counted one by one during devices initializations. + * + * So, we wait for all XGMI interlinked devices initialized. + * This may bring some delays as those devices may come from + * different hives. But that should be OK. + */ + if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { + for (i = 0; i < mgpu_info.num_gpu; i++) { + gpu_instance = &(mgpu_info.gpu_ins[i]); + if (gpu_instance->adev->flags & AMD_IS_APU) + continue; + + r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0); + if (r) { + DRM_ERROR("pstate setting failed (%d).\n", r); + break; + } + } + } + + mutex_unlock(&mgpu_info.mutex); + } return 0; } @@ -2128,7 +2299,9 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) if (r) { DRM_ERROR("suspend of IP block <%s> failed %d\n", adev->ip_blocks[i].version->funcs->name, r); + return r; } + adev->ip_blocks[i].status.hw = false; } } @@ -2156,6 +2329,12 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) /* displays are handled in phase1 */ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) continue; + /* PSP lost connection when err_event_athub occurs */ + if (amdgpu_ras_intr_triggered() && + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { + adev->ip_blocks[i].status.hw = false; + continue; + } /* XXX handle errors */ r = adev->ip_blocks[i].version->funcs->suspend(adev); /* XXX handle errors */ @@ -2163,6 +2342,18 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) DRM_ERROR("suspend of IP block <%s> failed %d\n", adev->ip_blocks[i].version->funcs->name, r); } + adev->ip_blocks[i].status.hw = false; + /* handle putting the SMC in the appropriate state */ + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { + r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); + if (r) { + DRM_ERROR("SMC failed to set mp1 state %d, %d\n", + adev->mp1_state, r); + return r; + } + } + + adev->ip_blocks[i].status.hw = false; } return 0; @@ -2215,6 +2406,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) for (j = 0; j < adev->num_ip_blocks; j++) { block = &adev->ip_blocks[j]; + block->status.hw = false; if (block->version->type != ip_order[i] || !block->status.valid) continue; @@ -2223,6 +2415,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); if (r) return r; + block->status.hw = true; } } @@ -2239,7 +2432,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_GFX, AMD_IP_BLOCK_TYPE_SDMA, AMD_IP_BLOCK_TYPE_UVD, - AMD_IP_BLOCK_TYPE_VCE + AMD_IP_BLOCK_TYPE_VCE, + AMD_IP_BLOCK_TYPE_VCN }; for (i = 0; i < ARRAY_SIZE(ip_order); i++) { @@ -2250,13 +2444,19 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) block = &adev->ip_blocks[j]; if (block->version->type != ip_order[i] || - !block->status.valid) + !block->status.valid || + block->status.hw) continue; - r = block->version->funcs->hw_init(adev); + if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) + r = block->version->funcs->resume(adev); + else + r = block->version->funcs->hw_init(adev); + DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); if (r) return r; + block->status.hw = true; } } @@ -2280,17 +2480,19 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) int i, r; for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) + if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) continue; if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { + r = adev->ip_blocks[i].version->funcs->resume(adev); if (r) { DRM_ERROR("resume of IP block <%s> failed %d\n", adev->ip_blocks[i].version->funcs->name, r); return r; } + adev->ip_blocks[i].status.hw = true; } } @@ -2315,7 +2517,7 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) int i, r; for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) + if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) continue; if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || @@ -2328,6 +2530,7 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) adev->ip_blocks[i].version->funcs->name, r); return r; } + adev->ip_blocks[i].status.hw = true; } return 0; @@ -2421,15 +2624,19 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) case CHIP_VEGA10: case CHIP_VEGA12: case CHIP_VEGA20: -#if defined(CONFIG_DRM_AMD_DC_DCN1_0) +#if defined(CONFIG_DRM_AMD_DC_DCN) case CHIP_RAVEN: -#endif -#if defined(CONFIG_DRM_AMD_DC_DCN2_0) case CHIP_NAVI10: + case CHIP_NAVI14: + case CHIP_NAVI12: + case CHIP_RENOIR: #endif return amdgpu_dc != 0; #endif default: + if (amdgpu_dc > 0) + DRM_INFO("Display Core has been requested via kernel parameter " + "but isn't supported by ASIC, ignoring\n"); return false; } } @@ -2454,13 +2661,110 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + + /* It's a bug to not have a hive within this function */ + if (WARN_ON(!hive)) + return; + + /* + * Use task barrier to synchronize all xgmi reset works across the + * hive. task_barrier_enter and task_barrier_exit will block + * until all the threads running the xgmi reset works reach + * those points. task_barrier_full will do both blocks. + */ + if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { + + task_barrier_enter(&hive->tb); + adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev); + + if (adev->asic_reset_res) + goto fail; - adev->asic_reset_res = amdgpu_asic_reset(adev); + task_barrier_exit(&hive->tb); + adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev); + + if (adev->asic_reset_res) + goto fail; + } else { + + task_barrier_full(&hive->tb); + adev->asic_reset_res = amdgpu_asic_reset(adev); + } + +fail: if (adev->asic_reset_res) DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev->ddev->unique); } +static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) +{ + char *input = amdgpu_lockup_timeout; + char *timeout_setting = NULL; + int index = 0; + long timeout; + int ret = 0; + + /* + * By default timeout for non compute jobs is 10000. + * And there is no timeout enforced on compute jobs. + * In SR-IOV or passthrough mode, timeout for compute + * jobs are 10000 by default. + */ + adev->gfx_timeout = msecs_to_jiffies(10000); + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; + if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) + adev->compute_timeout = adev->gfx_timeout; + else + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; + + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { + while ((timeout_setting = strsep(&input, ",")) && + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { + ret = kstrtol(timeout_setting, 0, &timeout); + if (ret) + return ret; + + if (timeout == 0) { + index++; + continue; + } else if (timeout < 0) { + timeout = MAX_SCHEDULE_TIMEOUT; + } else { + timeout = msecs_to_jiffies(timeout); + } + + switch (index++) { + case 0: + adev->gfx_timeout = timeout; + break; + case 1: + adev->compute_timeout = timeout; + break; + case 2: + adev->sdma_timeout = timeout; + break; + case 3: + adev->video_timeout = timeout; + break; + default: + break; + } + } + /* + * There is only one value specified and + * it should apply to all non-compute jobs. + */ + if (index == 1) { + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; + if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) + adev->compute_timeout = adev->gfx_timeout; + } + } + + return ret; +} /** * amdgpu_device_init - initialize the driver @@ -2480,7 +2784,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, uint32_t flags) { int r, i; - bool runtime = false; + bool boco = false; u32 max_MBps; adev->shutdown = false; @@ -2488,7 +2792,12 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->ddev = ddev; adev->pdev = pdev; adev->flags = flags; - adev->asic_type = flags & AMD_ASIC_MASK; + + if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) + adev->asic_type = amdgpu_force_asic_type; + else + adev->asic_type = flags & AMD_ASIC_MASK; + adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; if (amdgpu_emu_mode == 1) adev->usec_timeout *= 2; @@ -2498,7 +2807,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->mman.buffer_funcs = NULL; adev->mman.buffer_funcs_ring = NULL; adev->vm_manager.vm_pte_funcs = NULL; - adev->vm_manager.vm_pte_num_rqs = 0; + adev->vm_manager.vm_pte_num_scheds = 0; adev->gmc.gmc_funcs = NULL; adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); @@ -2509,6 +2818,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->pcie_wreg = &amdgpu_invalid_wreg; adev->pciep_rreg = &amdgpu_invalid_rreg; adev->pciep_wreg = &amdgpu_invalid_wreg; + adev->pcie_rreg64 = &amdgpu_invalid_rreg64; + adev->pcie_wreg64 = &amdgpu_invalid_wreg64; adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; adev->didt_rreg = &amdgpu_invalid_rreg; @@ -2536,8 +2847,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); mutex_init(&adev->lock_reset); - mutex_init(&adev->virt.dpm_mutex); mutex_init(&adev->psp.mutex); + mutex_init(&adev->notifier_lock); r = amdgpu_device_check_arguments(adev); if (r) @@ -2629,6 +2940,12 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) return r; + r = amdgpu_device_get_job_timeout_settings(adev); + if (r) { + dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); + return r; + } + /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); @@ -2637,12 +2954,15 @@ int amdgpu_device_init(struct amdgpu_device *adev, * ignore it */ vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); - if (amdgpu_device_is_px(ddev)) - runtime = true; - if (!pci_is_thunderbolt_attached(adev->pdev)) + if (amdgpu_device_supports_boco(ddev)) + boco = true; + if (amdgpu_has_atpx() && + (amdgpu_is_atpx_hybrid() || + amdgpu_has_atpx_dgpu_power_cntl()) && + !pci_is_thunderbolt_attached(adev->pdev)) vga_switcheroo_register_client(adev->pdev, - &amdgpu_switcheroo_ops, runtime); - if (runtime) + &amdgpu_switcheroo_ops, boco); + if (boco) vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); if (amdgpu_emu_mode == 1) { @@ -2729,11 +3049,17 @@ fence_driver_init: } dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); - if (amdgpu_virt_request_full_gpu(adev, false)) - amdgpu_virt_release_full_gpu(adev, false); goto failed; } + DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", + adev->gfx.config.max_shader_engines, + adev->gfx.config.max_sh_per_se, + adev->gfx.config.max_cu_per_sh, + adev->gfx.cu_info.number); + + amdgpu_ctx_init_sched(adev); + adev->accel_working = true; amdgpu_vm_check_compute_bug(adev); @@ -2748,16 +3074,19 @@ fence_driver_init: amdgpu_fbdev_init(adev); - if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) - amdgpu_pm_virt_sysfs_init(adev); - r = amdgpu_pm_sysfs_init(adev); - if (r) + if (r) { + adev->pm_sysfs_en = false; DRM_ERROR("registering pm debugfs failed (%d).\n", r); + } else + adev->pm_sysfs_en = true; r = amdgpu_ucode_sysfs_init(adev); - if (r) + if (r) { + adev->ucode_sysfs_en = false; DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); + } else + adev->ucode_sysfs_en = true; r = amdgpu_debugfs_gem_init(adev); if (r) @@ -2788,6 +3117,13 @@ fence_driver_init: DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); } + /* + * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. + * Otherwise the mgpu fan boost feature will be skipped due to the + * gpu instance is counted less. + */ + amdgpu_register_gpu_instance(adev); + /* enable clockgating, etc. after ib tests, etc. since some blocks require * explicit gating rather than handling it automatically. */ @@ -2819,7 +3155,7 @@ fence_driver_init: failed: amdgpu_vf_error_trans_all(adev); - if (runtime) + if (boco) vga_switcheroo_fini_domain_pm_ops(adev->dev); return r; @@ -2838,7 +3174,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) int r; DRM_INFO("amdgpu: finishing device.\n"); + flush_delayed_work(&adev->delayed_init_work); adev->shutdown = true; + /* disable all interrupts */ amdgpu_irq_disable_all(adev); if (adev->mode_info.mode_config_initialized){ @@ -2848,7 +3186,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) drm_atomic_helper_shutdown(adev->ddev); } amdgpu_fence_driver_fini(adev); - amdgpu_pm_sysfs_fini(adev); + if (adev->pm_sysfs_en) + amdgpu_pm_sysfs_fini(adev); amdgpu_fbdev_fini(adev); r = amdgpu_device_ip_fini(adev); if (adev->firmware.gpu_info_fw) { @@ -2856,7 +3195,6 @@ void amdgpu_device_fini(struct amdgpu_device *adev) adev->firmware.gpu_info_fw = NULL; } adev->accel_working = false; - cancel_delayed_work_sync(&adev->delayed_init_work); /* free i2c buses */ if (!amdgpu_device_has_dc_support(adev)) amdgpu_i2c_fini(adev); @@ -2866,9 +3204,12 @@ void amdgpu_device_fini(struct amdgpu_device *adev) kfree(adev->bios); adev->bios = NULL; - if (!pci_is_thunderbolt_attached(adev->pdev)) + if (amdgpu_has_atpx() && + (amdgpu_is_atpx_hybrid() || + amdgpu_has_atpx_dgpu_power_cntl()) && + !pci_is_thunderbolt_attached(adev->pdev)) vga_switcheroo_unregister_client(adev->pdev); - if (adev->flags & AMD_IS_PX) + if (amdgpu_device_supports_boco(adev->ddev)) vga_switcheroo_fini_domain_pm_ops(adev->dev); vga_client_register(adev->pdev, NULL, NULL, NULL); if (adev->rio_mem) @@ -2877,12 +3218,11 @@ void amdgpu_device_fini(struct amdgpu_device *adev) iounmap(adev->rmmio); adev->rmmio = NULL; amdgpu_device_doorbell_fini(adev); - if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) - amdgpu_pm_virt_sysfs_fini(adev); amdgpu_debugfs_regs_cleanup(adev); device_remove_file(adev->dev, &dev_attr_pcie_replay_count); - amdgpu_ucode_sysfs_fini(adev); + if (adev->ucode_sysfs_en) + amdgpu_ucode_sysfs_fini(adev); if (IS_ENABLED(CONFIG_PERF_EVENTS)) amdgpu_pmu_fini(adev); amdgpu_debugfs_preempt_cleanup(adev); @@ -2905,11 +3245,12 @@ void amdgpu_device_fini(struct amdgpu_device *adev) * Returns 0 for success or an error on failure. * Called at driver suspend. */ -int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) +int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) { struct amdgpu_device *adev; struct drm_crtc *crtc; struct drm_connector *connector; + struct drm_connector_list_iter iter; int r; if (dev == NULL || dev->dev_private == NULL) { @@ -2932,9 +3273,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) if (!amdgpu_device_has_dc_support(adev)) { /* turn off display hw */ drm_modeset_lock_all(dev); - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); - } + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) + drm_helper_connector_dpms(connector, + DRM_MODE_DPMS_OFF); + drm_connector_list_iter_end(&iter); drm_modeset_unlock_all(dev); /* unpin the front buffers and cursors */ list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { @@ -2985,17 +3328,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) */ amdgpu_bo_evict_vram(adev); - pci_save_state(dev->pdev); - if (suspend) { - /* Shut down the device */ - pci_disable_device(dev->pdev); - pci_set_power_state(dev->pdev, PCI_D3hot); - } else { - r = amdgpu_asic_reset(adev); - if (r) - DRM_ERROR("amdgpu asic reset failed\n"); - } - return 0; } @@ -3010,9 +3342,10 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) * Returns 0 for success or an error on failure. * Called at driver resume. */ -int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) +int amdgpu_device_resume(struct drm_device *dev, bool fbcon) { struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_device *adev = dev->dev_private; struct drm_crtc *crtc; int r = 0; @@ -3020,14 +3353,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; - if (resume) { - pci_set_power_state(dev->pdev, PCI_D0); - pci_restore_state(dev->pdev); - r = pci_enable_device(dev->pdev); - if (r) - return r; - } - /* post card */ if (amdgpu_device_need_post(adev)) { r = amdgpu_atom_asic_init(adev->mode_info.atom_context); @@ -3083,9 +3408,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) /* turn on display hw */ drm_modeset_lock_all(dev); - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); - } + + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) + drm_helper_connector_dpms(connector, + DRM_MODE_DPMS_ON); + drm_connector_list_iter_end(&iter); + drm_modeset_unlock_all(dev); } amdgpu_fbdev_set_suspend(adev, 0); @@ -3362,13 +3691,12 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) goto error; + amdgpu_virt_init_data_exchange(adev); /* we need recover gart prior to run SMC/CP/SDMA resume */ amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); @@ -3386,10 +3714,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, amdgpu_amdkfd_post_reset(adev); error: - amdgpu_virt_init_data_exchange(adev); amdgpu_virt_release_full_gpu(adev, true); if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { - atomic_inc(&adev->vram_lost_counter); + amdgpu_inc_vram_lost(adev); r = amdgpu_device_recover_vram(adev); } @@ -3431,6 +3758,12 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_VEGA20: case CHIP_VEGA10: case CHIP_VEGA12: + case CHIP_RAVEN: + case CHIP_ARCTURUS: + case CHIP_RENOIR: + case CHIP_NAVI10: + case CHIP_NAVI14: + case CHIP_NAVI12: break; default: goto disabled; @@ -3507,7 +3840,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) + if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; } else r = amdgpu_asic_reset(tmp_adev); @@ -3519,7 +3852,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } } - /* For XGMI wait for all PSP resets to complete before proceed */ + /* For XGMI wait for all resets to complete before proceed */ if (!r) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { @@ -3530,14 +3863,11 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, break; } } - - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - amdgpu_ras_reserve_bad_pages(tmp_adev); - } } } + if (!r && amdgpu_ras_intr_triggered()) + amdgpu_ras_intr_cleared(); list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (need_full_reset) { @@ -3554,7 +3884,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, vram_lost = amdgpu_device_check_vram_lost(tmp_adev); if (vram_lost) { DRM_INFO("VRAM is lost due to GPU reset!\n"); - atomic_inc(&tmp_adev->vram_lost_counter); + amdgpu_inc_vram_lost(tmp_adev); } r = amdgpu_gtt_mgr_recover( @@ -3626,25 +3956,30 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) mutex_lock(&adev->lock_reset); atomic_inc(&adev->gpu_reset_counter); - adev->in_gpu_reset = 1; - /* Block kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_pre_reset(adev); + adev->in_gpu_reset = true; + switch (amdgpu_asic_reset_method(adev)) { + case AMD_RESET_METHOD_MODE1: + adev->mp1_state = PP_MP1_STATE_SHUTDOWN; + break; + case AMD_RESET_METHOD_MODE2: + adev->mp1_state = PP_MP1_STATE_RESET; + break; + default: + adev->mp1_state = PP_MP1_STATE_NONE; + break; + } return true; } static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { - /*unlock kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_post_reset(adev); amdgpu_vf_error_trans_all(adev); - adev->in_gpu_reset = 0; + adev->mp1_state = PP_MP1_STATE_NONE; + adev->in_gpu_reset = false; mutex_unlock(&adev->lock_reset); } - /** * amdgpu_device_gpu_recover - reset the asic and recover scheduler * @@ -3664,11 +3999,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; + bool in_ras_intr = amdgpu_ras_intr_triggered(); + bool use_baco = + (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? + true : false; + + /* + * Flush RAM to disk so that after reboot + * the user can read log and see why the system rebooted. + */ + if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) { + + DRM_WARN("Emergency reboot."); + + ksys_sync_helper(); + emergency_restart(); + } need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); - dev_info(adev->dev, "GPU reset begin!\n"); + dev_info(adev->dev, "GPU %s begin!\n", + (in_ras_intr && !use_baco) ? "jobs stop":"reset"); cancel_delayed_work_sync(&adev->delayed_init_work); @@ -3684,20 +4036,27 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (hive && !mutex_trylock(&hive->reset_lock)) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", - job->base.id, hive->hive_id); + job ? job->base.id : -1, hive->hive_id); return 0; } /* Start with adev pre asic reset first for soft reset check.*/ if (!amdgpu_device_lock_adev(adev, !hive)) { DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", - job->base.id); + job ? job->base.id : -1); return 0; } + /* Block kfd: SRIOV would do it separately */ + if (!amdgpu_sriov_vf(adev)) + amdgpu_amdkfd_pre_reset(adev); + /* Build list of devices to reset */ if (adev->gmc.xgmi.num_physical_nodes > 1) { if (!hive) { + /*unlock kfd: SRIOV would do it separately */ + if (!amdgpu_sriov_vf(adev)) + amdgpu_amdkfd_post_reset(adev); amdgpu_device_unlock_adev(adev); return -ENODEV; } @@ -3713,17 +4072,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, device_list_handle = &device_list; } - /* - * Mark these ASICs to be reseted as untracked first - * And add them back after reset completed - */ - list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) - amdgpu_unregister_gpu_instance(tmp_adev); - /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + if (tmp_adev != adev) { + amdgpu_device_lock_adev(tmp_adev, false); + if (!amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_pre_reset(tmp_adev); + } + + /* + * Mark these ASICs to be reseted as untracked first + * And add them back after reset completed + */ + amdgpu_unregister_gpu_instance(tmp_adev); + /* disable ras on ALL IPs */ - if (amdgpu_device_ip_need_full_reset(tmp_adev)) + if (!(in_ras_intr && !use_baco) && + amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -3732,11 +4097,17 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (!ring || !ring->sched.thread) continue; - drm_sched_stop(&ring->sched, &job->base); + drm_sched_stop(&ring->sched, job ? &job->base : NULL); + + if (in_ras_intr && !use_baco) + amdgpu_job_stop_all_jobs_on_sched(&ring->sched); } } + if (in_ras_intr && !use_baco) + goto skip_sched_resume; + /* * Must check guilty signal here since after this point all old * HW fences are force signaled. @@ -3747,9 +4118,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, dma_fence_is_signaled(job->base.s_fence->parent)) job_signaled = true; - if (!amdgpu_device_ip_need_full_reset(adev)) - device_list_handle = &device_list; - if (job_signaled) { dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); goto skip_hw_reset; @@ -3757,9 +4125,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* Guilty job will be freed after this*/ - r = amdgpu_device_pre_asic_reset(adev, - job, - &need_full_reset); + r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset); if (r) { /*TODO Should we stop ?*/ DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", @@ -3773,7 +4139,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (tmp_adev == adev) continue; - amdgpu_device_lock_adev(tmp_adev, false); r = amdgpu_device_pre_asic_reset(tmp_adev, NULL, &need_full_reset); @@ -3801,6 +4166,7 @@ skip_hw_reset: /* Post ASIC reset for all devs .*/ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -3822,12 +4188,18 @@ skip_hw_reset: if (r) { /* bad news, how to tell it to userspace ? */ - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter)); + dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); } else { - dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter)); + dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); } + } +skip_sched_resume: + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + /*unlock kfd: SRIOV would do it separately */ + if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_post_reset(tmp_adev); amdgpu_device_unlock_adev(tmp_adev); } @@ -3975,3 +4347,35 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) } } +int amdgpu_device_baco_enter(struct drm_device *dev) +{ + struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (!amdgpu_device_supports_baco(adev->ddev)) + return -ENOTSUPP; + + if (ras && ras->supported) + adev->nbio.funcs->enable_doorbell_interrupt(adev, false); + + return amdgpu_dpm_baco_enter(adev); +} + +int amdgpu_device_baco_exit(struct drm_device *dev) +{ + struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + int ret = 0; + + if (!amdgpu_device_supports_baco(adev->ddev)) + return -ENOTSUPP; + + ret = amdgpu_dpm_baco_exit(adev); + if (ret) + return ret; + + if (ras && ras->supported) + adev->nbio.funcs->enable_doorbell_interrupt(adev, true); + + return 0; +} |