diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2013-06-10 11:20:21 +0100 |
---|---|---|
committer | Daniel Vetter <daniel.vetter@ffwll.ch> | 2013-06-11 11:49:28 +0200 |
commit | 6274f2126a0454d3c3df1bc9cc6f5e18302696f7 (patch) | |
tree | c9a41bb9c0cf3b035ee63361e03fe90f95c13abf /drivers | |
parent | 9107e9d227e3b0893829baee4ac59feb874d4c23 (diff) | |
download | blackbird-op-linux-6274f2126a0454d3c3df1bc9cc6f5e18302696f7.tar.gz blackbird-op-linux-6274f2126a0454d3c3df1bc9cc6f5e18302696f7.zip |
drm/i915: Don't count semaphore waits towards a stuck ring
If we detect a ring is in a valid wait for another, just let it be.
Eventually it will either begin to progress again, or the entire system
will come grinding to a halt and then hangcheck will fire as soon as the
deadlock is detected.
This error was foretold by Ben in
commit 05407ff889ceebe383aa5907219f86582ef96b72
Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Date: Thu May 30 09:04:29 2013 +0300
drm/i915: detect hang using per ring hangcheck_score
"If ring B is waiting on ring A via semaphore, and ring A is making
progress, albeit slowly - the hangcheck will fire. The check will
determine that A is moving, however ring B will appear hung because
the ACTHD doesn't move. I honestly can't say if that's actually a
realistic problem to hit it probably implies the timeout value is too
low."
v2: Make sure we don't even incur the KICK cost whilst waiting.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/gpu/drm/i915/i915_irq.c | 121 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/intel_ringbuffer.h | 1 |
2 files changed, 90 insertions, 32 deletions
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 6a757691488e..563abe79bb20 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2321,21 +2321,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno) i915_seqno_passed(seqno, ring_last_seqno(ring))); } -static bool semaphore_passed(struct intel_ring_buffer *ring) +static struct intel_ring_buffer * +semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno) { struct drm_i915_private *dev_priv = ring->dev->dev_private; - u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR; - struct intel_ring_buffer *signaller; - u32 cmd, ipehr, acthd_min; + u32 cmd, ipehr, acthd, acthd_min; ipehr = I915_READ(RING_IPEHR(ring->mmio_base)); if ((ipehr & ~(0x3 << 16)) != (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER)) - return false; + return NULL; /* ACTHD is likely pointing to the dword after the actual command, * so scan backwards until we find the MBOX. */ + acthd = intel_ring_get_active_head(ring) & HEAD_ADDR; acthd_min = max((int)acthd - 3 * 4, 0); do { cmd = ioread32(ring->virtual_start + acthd); @@ -2344,22 +2344,53 @@ static bool semaphore_passed(struct intel_ring_buffer *ring) acthd -= 4; if (acthd < acthd_min) - return false; + return NULL; } while (1); - signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3]; - return i915_seqno_passed(signaller->get_seqno(signaller, false), - ioread32(ring->virtual_start+acthd+4)+1); + *seqno = ioread32(ring->virtual_start+acthd+4)+1; + return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3]; +} + +static int semaphore_passed(struct intel_ring_buffer *ring) +{ + struct drm_i915_private *dev_priv = ring->dev->dev_private; + struct intel_ring_buffer *signaller; + u32 seqno, ctl; + + ring->hangcheck.deadlock = true; + + signaller = semaphore_waits_for(ring, &seqno); + if (signaller == NULL || signaller->hangcheck.deadlock) + return -1; + + /* cursory check for an unkickable deadlock */ + ctl = I915_READ_CTL(signaller); + if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0) + return -1; + + return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno); +} + +static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv) +{ + struct intel_ring_buffer *ring; + int i; + + for_each_ring(ring, dev_priv, i) + ring->hangcheck.deadlock = false; } -static bool ring_hung(struct intel_ring_buffer *ring) +static enum { wait, active, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd) { struct drm_device *dev = ring->dev; struct drm_i915_private *dev_priv = dev->dev_private; u32 tmp; + if (ring->hangcheck.acthd != acthd) + return active; + if (IS_GEN2(dev)) - return true; + return hung; /* Is the chip hanging on a WAIT_FOR_EVENT? * If so we can simply poke the RB_WAIT bit @@ -2371,19 +2402,24 @@ static bool ring_hung(struct intel_ring_buffer *ring) DRM_ERROR("Kicking stuck wait on %s\n", ring->name); I915_WRITE_CTL(ring, tmp); - return false; - } - - if (INTEL_INFO(dev)->gen >= 6 && - tmp & RING_WAIT_SEMAPHORE && - semaphore_passed(ring)) { - DRM_ERROR("Kicking stuck semaphore on %s\n", - ring->name); - I915_WRITE_CTL(ring, tmp); - return false; + return kick; + } + + if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) { + switch (semaphore_passed(ring)) { + default: + return hung; + case 1: + DRM_ERROR("Kicking stuck semaphore on %s\n", + ring->name); + I915_WRITE_CTL(ring, tmp); + return kick; + case 0: + return wait; + } } - return true; + return hung; } /** @@ -2414,6 +2450,8 @@ void i915_hangcheck_elapsed(unsigned long data) u32 seqno, acthd; bool busy = true; + semaphore_clear_deadlocks(dev_priv); + seqno = ring->get_seqno(ring, false); acthd = intel_ring_get_active_head(ring); @@ -2430,17 +2468,36 @@ void i915_hangcheck_elapsed(unsigned long data) } else { int score; - stuck[i] = ring->hangcheck.acthd == acthd; - if (stuck[i]) { - /* Every time we kick the ring, add a - * small increment to the hangcheck - * score so that we can catch a - * batch that is repeatedly kicked. - */ - score = ring_hung(ring) ? HUNG : KICK; - } else + /* We always increment the hangcheck score + * if the ring is busy and still processing + * the same request, so that no single request + * can run indefinitely (such as a chain of + * batches). The only time we do not increment + * the hangcheck score on this ring, if this + * ring is in a legitimate wait for another + * ring. In that case the waiting ring is a + * victim and we want to be sure we catch the + * right culprit. Then every time we do kick + * the ring, add a small increment to the + * score so that we can catch a batch that is + * being repeatedly kicked and so responsible + * for stalling the machine. + */ + switch (ring_stuck(ring, acthd)) { + case wait: + score = 0; + break; + case active: score = BUSY; - + break; + case kick: + score = KICK; + break; + case hung: + score = HUNG; + stuck[i] = true; + break; + } ring->hangcheck.score += score; } } else { diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index efc403d1e3e0..a3e96103dbe5 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -38,6 +38,7 @@ struct intel_hw_status_page { #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base)) struct intel_ring_hangcheck { + bool deadlock; u32 seqno; u32 acthd; int score; |