diff options
author | Christian König <christian.koenig@amd.com> | 2018-08-21 11:11:36 +0200 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2018-08-27 15:10:07 -0500 |
commit | 7876fa4f55fda4a57348832f4a668279ed2b2fc4 (patch) | |
tree | 3a2c8d39c5a86fff4f5f71906b26c4aa69be7a6c | |
parent | 07e6d3f03b5ff7b93af37932ee0f4b775812274f (diff) | |
download | talos-obmc-linux-7876fa4f55fda4a57348832f4a668279ed2b2fc4.tar.gz talos-obmc-linux-7876fa4f55fda4a57348832f4a668279ed2b2fc4.zip |
drm/amdgpu: add ring soft recovery v4
Instead of hammering hard on the GPU try a soft recovery first.
v2: reorder code a bit
v3: increase timeout to 10ms, increment GPU reset counter
v4: squash in compile fix (Christian)
Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Huang Rui <ray.huang@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 25 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 4 |
3 files changed, 35 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index facc0f08d804..34e54d41f5ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -33,6 +33,12 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job) struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); struct amdgpu_job *job = to_amdgpu_job(s_job); + if (amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { + DRM_ERROR("ring %s timeout, but soft recovered\n", + s_job->sched->name); + return; + } + DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n", job->base.sched->name, atomic_read(&ring->fence_drv.last_seq), ring->fence_drv.sync_seq); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 5dfd26be1eec..b70e85ec147d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -383,6 +383,31 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring, amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask); } +/** + * amdgpu_ring_soft_recovery - try to soft recover a ring lockup + * + * @ring: ring to try the recovery on + * @vmid: VMID we try to get going again + * @fence: timedout fence + * + * Tries to get a ring proceeding again when it is stuck. + */ +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, + struct dma_fence *fence) +{ + ktime_t deadline = ktime_add_us(ktime_get(), 10000); + + if (!ring->funcs->soft_recovery) + return false; + + atomic_inc(&ring->adev->gpu_reset_counter); + while (!dma_fence_is_signaled(fence) && + ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0) + ring->funcs->soft_recovery(ring, vmid); + + return dma_fence_is_signaled(fence); +} + /* * Debugfs info */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 409fdd9b9710..9cc239968e40 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -168,6 +168,8 @@ struct amdgpu_ring_funcs { /* priority functions */ void (*set_priority) (struct amdgpu_ring *ring, enum drm_sched_priority priority); + /* Try to soft recover the ring to make the fence signal */ + void (*soft_recovery)(struct amdgpu_ring *ring, unsigned vmid); }; struct amdgpu_ring { @@ -260,6 +262,8 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring); void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring, uint32_t reg0, uint32_t val0, uint32_t reg1, uint32_t val1); +bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, + struct dma_fence *fence); static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring) { |