diff options
| author | Dave Airlie <airlied@redhat.com> | 2019-10-26 05:56:57 +1000 |
|---|---|---|
| committer | Dave Airlie <airlied@redhat.com> | 2019-10-26 05:56:57 +1000 |
| commit | 3275a71e76fac5bc276f0d60e027b18c2e8d7a5b (patch) | |
| tree | f275ab1c98be91f5e0fda869819e09c05d0918ab /drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | |
| parent | 2e79e22e092acd55da0b2db066e4826d7d152c41 (diff) | |
| parent | 1cd4d9eead73c004d08a58536dc726bd172eaaec (diff) | |
| download | blackbird-op-linux-3275a71e76fac5bc276f0d60e027b18c2e8d7a5b.tar.gz blackbird-op-linux-3275a71e76fac5bc276f0d60e027b18c2e8d7a5b.zip | |
Merge tag 'drm-next-5.5-2019-10-09' of git://people.freedesktop.org/~agd5f/linux into drm-next
drm-next-5.5-2019-10-09:
amdgpu:
- Additional RAS enablement for vega20
- RAS page retirement and bad page storage in EEPROM
- No GPU reset with unrecoverable RAS errors
- Reserve vram for page tables rather than trying to evict
- Fix issues with GPU reset and xgmi hives
- DC i2c over aux fixes
- Direct submission for clears, PTE/PDE updates
- Improvements to help support recoverable GPU page faults
- Silence harmless SAD block messages
- Clean up code for creating a bo at a fixed location
- Initial DC HDCP support
- Lots of documentation fixes
- GPU reset for renoir
- Add IH clockgating support for soc15 asics
- Powerplay improvements
- DC MST cleanups
- Add support for MSI-X
- Misc cleanups and bug fixes
amdkfd:
- Query KFD device info by asic type rather than pci ids
- Add navi14 support
- Add renoir support
- Add navi12 support
- gfx10 trap handler improvements
- pasid cleanups
- Check against device cgroup
ttm:
- Return -EBUSY with pipelining with no_gpu_wait
radeon:
- Silence harmless SAD block messages
device_cgroup:
- Export devcgroup_check_permission
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexdeucher@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191010041713.3412-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/umc_v6_1.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 48 |
1 files changed, 37 insertions, 11 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c index 8502e736f721..47c4b96b14d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c @@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev) RSMU_UMC_INDEX_MODE_EN, 0); } +static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev) +{ + uint32_t rsmu_umc_index; + + rsmu_umc_index = RREG32_SOC15(RSMU, 0, + mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU); + return REG_GET_FIELD(rsmu_umc_index, + RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, + RSMU_UMC_INDEX_INSTANCE); +} + static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, uint32_t umc_reg_offset, unsigned long *error_count) @@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, uint32_t umc_reg_offset, uint32_t channel_index) { uint32_t lsb, mc_umc_status_addr; - uint64_t mc_umc_status, err_addr; + uint64_t mc_umc_status, err_addr, retired_page; + struct eeprom_table_record *err_rec; mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); @@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, return; } + err_rec = &err_data->err_addr[err_data->err_addr_cnt]; mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset); /* calculate error address if ue/ce error is detected */ @@ -191,12 +204,24 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, err_addr &= ~((0x1ULL << lsb) - 1); /* translate umc channel address to soc pa, 3 parts are included */ - err_data->err_addr[err_data->err_addr_cnt] = - ADDR_OF_8KB_BLOCK(err_addr) | - ADDR_OF_256B_BLOCK(channel_index) | - OFFSET_IN_256B_BLOCK(err_addr); - - err_data->err_addr_cnt++; + retired_page = ADDR_OF_8KB_BLOCK(err_addr) | + ADDR_OF_256B_BLOCK(channel_index) | + OFFSET_IN_256B_BLOCK(err_addr); + + /* we only save ue error information currently, ce is skipped */ + if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) + == 1) { + err_rec->address = err_addr; + /* page frame address is saved */ + err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT; + err_rec->ts = (uint64_t)ktime_get_real_seconds(); + err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; + err_rec->cu = 0; + err_rec->mem_channel = channel_index; + err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev); + + err_data->err_addr_cnt++; + } } /* clear umc status */ @@ -209,7 +234,7 @@ static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev, amdgpu_umc_for_each_channel(umc_v6_1_query_error_address); } -static void umc_v6_1_ras_init_per_channel(struct amdgpu_device *adev, +static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, struct ras_err_data *err_data, uint32_t umc_reg_offset, uint32_t channel_index) { @@ -239,15 +264,16 @@ static void umc_v6_1_ras_init_per_channel(struct amdgpu_device *adev, WREG32(ecc_err_cnt_addr + umc_reg_offset, UMC_V6_1_CE_CNT_INIT); } -static void umc_v6_1_ras_init(struct amdgpu_device *adev) +static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev) { void *ras_error_status = NULL; - amdgpu_umc_for_each_channel(umc_v6_1_ras_init_per_channel); + amdgpu_umc_for_each_channel(umc_v6_1_err_cnt_init_per_channel); } const struct amdgpu_umc_funcs umc_v6_1_funcs = { - .ras_init = umc_v6_1_ras_init, + .err_cnt_init = umc_v6_1_err_cnt_init, + .ras_late_init = amdgpu_umc_ras_late_init, .query_ras_error_count = umc_v6_1_query_ras_error_count, .query_ras_error_address = umc_v6_1_query_ras_error_address, .enable_umc_index_mode = umc_v6_1_enable_umc_index_mode, |

