diff options
Diffstat (limited to 'drivers/gpu/drm/i915/i915_gpu_error.c')
-rw-r--r-- | drivers/gpu/drm/i915/i915_gpu_error.c | 1717 |
1 files changed, 865 insertions, 852 deletions
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 8bc76fcff70d..594341e27a47 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -29,8 +29,8 @@ #include <linux/ascii85.h> #include <linux/nmi.h> +#include <linux/pagevec.h> #include <linux/scatterlist.h> -#include <linux/stop_machine.h> #include <linux/utsname.h> #include <linux/zlib.h> @@ -40,52 +40,17 @@ #include "display/intel_overlay.h" #include "gem/i915_gem_context.h" +#include "gem/i915_gem_lmem.h" +#include "gt/intel_gt_pm.h" #include "i915_drv.h" #include "i915_gpu_error.h" +#include "i915_memcpy.h" #include "i915_scatterlist.h" #include "intel_csr.h" -static inline const struct intel_engine_cs * -engine_lookup(const struct drm_i915_private *i915, unsigned int id) -{ - if (id >= I915_NUM_ENGINES) - return NULL; - - return i915->engine[id]; -} - -static inline const char * -__engine_name(const struct intel_engine_cs *engine) -{ - return engine ? engine->name : ""; -} - -static const char * -engine_name(const struct drm_i915_private *i915, unsigned int id) -{ - return __engine_name(engine_lookup(i915, id)); -} - -static const char *tiling_flag(int tiling) -{ - switch (tiling) { - default: - case I915_TILING_NONE: return ""; - case I915_TILING_X: return " X"; - case I915_TILING_Y: return " Y"; - } -} - -static const char *dirty_flag(int dirty) -{ - return dirty ? " dirty" : ""; -} - -static const char *purgeable_flag(int purgeable) -{ - return purgeable ? " purgeable" : ""; -} +#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) +#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN) static void __sg_set_buf(struct scatterlist *sg, void *addr, unsigned int len, loff_t it) @@ -114,7 +79,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len) if (e->cur == e->end) { struct scatterlist *sgl; - sgl = (typeof(sgl))__get_free_page(GFP_KERNEL); + sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL); if (!sgl) { e->err = -ENOMEM; return false; @@ -134,7 +99,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len) } e->size = ALIGN(len + 1, SZ_64K); - e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + e->buf = kmalloc(e->size, ALLOW_FAIL); if (!e->buf) { e->size = PAGE_ALIGN(len + 1); e->buf = kmalloc(e->size, GFP_KERNEL); @@ -211,63 +176,132 @@ i915_error_printer(struct drm_i915_error_state_buf *e) return p; } +/* single threaded page allocator with a reserved stash for emergencies */ +static void pool_fini(struct pagevec *pv) +{ + pagevec_release(pv); +} + +static int pool_refill(struct pagevec *pv, gfp_t gfp) +{ + while (pagevec_space(pv)) { + struct page *p; + + p = alloc_page(gfp); + if (!p) + return -ENOMEM; + + pagevec_add(pv, p); + } + + return 0; +} + +static int pool_init(struct pagevec *pv, gfp_t gfp) +{ + int err; + + pagevec_init(pv); + + err = pool_refill(pv, gfp); + if (err) + pool_fini(pv); + + return err; +} + +static void *pool_alloc(struct pagevec *pv, gfp_t gfp) +{ + struct page *p; + + p = alloc_page(gfp); + if (!p && pagevec_count(pv)) + p = pv->pages[--pv->nr]; + + return p ? page_address(p) : NULL; +} + +static void pool_free(struct pagevec *pv, void *addr) +{ + struct page *p = virt_to_page(addr); + + if (pagevec_space(pv)) + pagevec_add(pv, p); + else + __free_page(p); +} + #ifdef CONFIG_DRM_I915_COMPRESS_ERROR -struct compress { +struct i915_vma_compress { + struct pagevec pool; struct z_stream_s zstream; void *tmp; }; -static bool compress_init(struct compress *c) +static bool compress_init(struct i915_vma_compress *c) { - struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream)); + struct z_stream_s *zstream = &c->zstream; - zstream->workspace = - kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), - GFP_ATOMIC | __GFP_NOWARN); - if (!zstream->workspace) + if (pool_init(&c->pool, ALLOW_FAIL)) return false; - if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) { - kfree(zstream->workspace); + zstream->workspace = + kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + ALLOW_FAIL); + if (!zstream->workspace) { + pool_fini(&c->pool); return false; } c->tmp = NULL; if (i915_has_memcpy_from_wc()) - c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN); + c->tmp = pool_alloc(&c->pool, ALLOW_FAIL); return true; } -static void *compress_next_page(struct drm_i915_error_object *dst) +static bool compress_start(struct i915_vma_compress *c) +{ + struct z_stream_s *zstream = &c->zstream; + void *workspace = zstream->workspace; + + memset(zstream, 0, sizeof(*zstream)); + zstream->workspace = workspace; + + return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK; +} + +static void *compress_next_page(struct i915_vma_compress *c, + struct i915_vma_coredump *dst) { - unsigned long page; + void *page; if (dst->page_count >= dst->num_pages) return ERR_PTR(-ENOSPC); - page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); + page = pool_alloc(&c->pool, ALLOW_FAIL); if (!page) return ERR_PTR(-ENOMEM); - return dst->pages[dst->page_count++] = (void *)page; + return dst->pages[dst->page_count++] = page; } -static int compress_page(struct compress *c, +static int compress_page(struct i915_vma_compress *c, void *src, - struct drm_i915_error_object *dst) + struct i915_vma_coredump *dst, + bool wc) { struct z_stream_s *zstream = &c->zstream; zstream->next_in = src; - if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) + if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) zstream->next_in = c->tmp; zstream->avail_in = PAGE_SIZE; do { if (zstream->avail_out == 0) { - zstream->next_out = compress_next_page(dst); + zstream->next_out = compress_next_page(c, dst); if (IS_ERR(zstream->next_out)) return PTR_ERR(zstream->next_out); @@ -276,8 +310,6 @@ static int compress_page(struct compress *c, if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) return -EIO; - - touch_nmi_watchdog(); } while (zstream->avail_in); /* Fallback to uncompressed if we increase size? */ @@ -287,15 +319,15 @@ static int compress_page(struct compress *c, return 0; } -static int compress_flush(struct compress *c, - struct drm_i915_error_object *dst) +static int compress_flush(struct i915_vma_compress *c, + struct i915_vma_coredump *dst) { struct z_stream_s *zstream = &c->zstream; do { switch (zlib_deflate(zstream, Z_FINISH)) { case Z_OK: /* more space requested */ - zstream->next_out = compress_next_page(dst); + zstream->next_out = compress_next_page(c, dst); if (IS_ERR(zstream->next_out)) return PTR_ERR(zstream->next_out); @@ -316,15 +348,17 @@ end: return 0; } -static void compress_fini(struct compress *c, - struct drm_i915_error_object *dst) +static void compress_finish(struct i915_vma_compress *c) { - struct z_stream_s *zstream = &c->zstream; + zlib_deflateEnd(&c->zstream); +} - zlib_deflateEnd(zstream); - kfree(zstream->workspace); +static void compress_fini(struct i915_vma_compress *c) +{ + kfree(c->zstream.workspace); if (c->tmp) - free_page((unsigned long)c->tmp); + pool_free(&c->pool, c->tmp); + pool_fini(&c->pool); } static void err_compression_marker(struct drm_i915_error_state_buf *m) @@ -334,44 +368,53 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m) #else -struct compress { +struct i915_vma_compress { + struct pagevec pool; }; -static bool compress_init(struct compress *c) +static bool compress_init(struct i915_vma_compress *c) +{ + return pool_init(&c->pool, ALLOW_FAIL) == 0; +} + +static bool compress_start(struct i915_vma_compress *c) { return true; } -static int compress_page(struct compress *c, +static int compress_page(struct i915_vma_compress *c, void *src, - struct drm_i915_error_object *dst) + struct i915_vma_coredump *dst, + bool wc) { - unsigned long page; void *ptr; - page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); - if (!page) + ptr = pool_alloc(&c->pool, ALLOW_FAIL); + if (!ptr) return -ENOMEM; - ptr = (void *)page; - if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE)) + if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE))) memcpy(ptr, src, PAGE_SIZE); dst->pages[dst->page_count++] = ptr; return 0; } -static int compress_flush(struct compress *c, - struct drm_i915_error_object *dst) +static int compress_flush(struct i915_vma_compress *c, + struct i915_vma_coredump *dst) { return 0; } -static void compress_fini(struct compress *c, - struct drm_i915_error_object *dst) +static void compress_finish(struct i915_vma_compress *c) { } +static void compress_fini(struct i915_vma_compress *c) +{ + pool_fini(&c->pool); +} + static void err_compression_marker(struct drm_i915_error_state_buf *m) { err_puts(m, "~"); @@ -379,46 +422,17 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m) #endif -static void print_error_buffers(struct drm_i915_error_state_buf *m, - const char *name, - struct drm_i915_error_buffer *err, - int count) -{ - err_printf(m, "%s [%d]:\n", name, count); - - while (count--) { - err_printf(m, " %08x_%08x %8u %02x %02x", - upper_32_bits(err->gtt_offset), - lower_32_bits(err->gtt_offset), - err->size, - err->read_domains, - err->write_domain); - err_puts(m, tiling_flag(err->tiling)); - err_puts(m, dirty_flag(err->dirty)); - err_puts(m, purgeable_flag(err->purgeable)); - err_puts(m, err->userptr ? " userptr" : ""); - err_puts(m, i915_cache_level_str(m->i915, err->cache_level)); - - if (err->name) - err_printf(m, " (name: %d)", err->name); - if (err->fence_reg != I915_FENCE_REG_NONE) - err_printf(m, " (fence: %d)", err->fence_reg); - - err_puts(m, "\n"); - err++; - } -} - static void error_print_instdone(struct drm_i915_error_state_buf *m, - const struct drm_i915_error_engine *ee) + const struct intel_engine_coredump *ee) { + const struct sseu_dev_info *sseu = &RUNTIME_INFO(m->i915)->sseu; int slice; int subslice; err_printf(m, " INSTDONE: 0x%08x\n", ee->instdone.instdone); - if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3) + if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3) return; err_printf(m, " SC_INSTDONE: 0x%08x\n", @@ -427,12 +441,12 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m, if (INTEL_GEN(m->i915) <= 6) return; - for_each_instdone_slice_subslice(m->i915, slice, subslice) + for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice) err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n", slice, subslice, ee->instdone.sampler[slice][subslice]); - for_each_instdone_slice_subslice(m->i915, slice, subslice) + for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice) err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n", slice, subslice, ee->instdone.row[slice][subslice]); @@ -440,41 +454,56 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m, static void error_print_request(struct drm_i915_error_state_buf *m, const char *prefix, - const struct drm_i915_error_request *erq, - const unsigned long epoch) + const struct i915_request_coredump *erq) { if (!erq->seqno) return; - err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", + err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, start %08x, head %08x, tail %08x\n", prefix, erq->pid, erq->context, erq->seqno, test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &erq->flags) ? "!" : "", test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &erq->flags) ? "+" : "", erq->sched_attr.priority, - jiffies_to_msecs(erq->jiffies - epoch), erq->start, erq->head, erq->tail); } static void error_print_context(struct drm_i915_error_state_buf *m, const char *header, - const struct drm_i915_error_context *ctx) + const struct i915_gem_context_coredump *ctx) +{ + err_printf(m, "%s%s[%d] prio %d, guilty %d active %d\n", + header, ctx->comm, ctx->pid, ctx->sched_attr.priority, + ctx->guilty, ctx->active); +} + +static struct i915_vma_coredump * +__find_vma(struct i915_vma_coredump *vma, const char *name) +{ + while (vma) { + if (strcmp(vma->name, name) == 0) + return vma; + vma = vma->next; + } + + return NULL; +} + +static struct i915_vma_coredump * +find_batch(const struct intel_engine_coredump *ee) { - err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n", - header, ctx->comm, ctx->pid, ctx->hw_id, - ctx->sched_attr.priority, ctx->guilty, ctx->active); + return __find_vma(ee->vma, "batch"); } static void error_print_engine(struct drm_i915_error_state_buf *m, - const struct drm_i915_error_engine *ee, - const unsigned long epoch) + const struct intel_engine_coredump *ee) { + struct i915_vma_coredump *batch; int n; - err_printf(m, "%s command stream:\n", - engine_name(m->i915, ee->engine_id)); - err_printf(m, " IDLE?: %s\n", yesno(ee->idle)); + err_printf(m, "%s command stream:\n", ee->engine->name); + err_printf(m, " CCID: 0x%08x\n", ee->ccid); err_printf(m, " START: 0x%08x\n", ee->start); err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n", @@ -489,9 +518,10 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, error_print_instdone(m, ee); - if (ee->batchbuffer) { - u64 start = ee->batchbuffer->gtt_offset; - u64 end = start + ee->batchbuffer->gtt_size; + batch = find_batch(ee); + if (batch) { + u64 start = batch->gtt_offset; + u64 end = start + batch->gtt_size; err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n", upper_32_bits(start), lower_32_bits(start), @@ -523,17 +553,11 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, ee->vm_info.pp_dir_base); } } - err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); - err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); - err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n", - jiffies_to_msecs(ee->hangcheck_timestamp - epoch), - ee->hangcheck_timestamp, - ee->hangcheck_timestamp == epoch ? "; epoch" : ""); err_printf(m, " engine reset count: %u\n", ee->reset_count); for (n = 0; n < ee->num_ports; n++) { err_printf(m, " ELSP[%d]:", n); - error_print_request(m, " ", &ee->execlist[n], epoch); + error_print_request(m, " ", &ee->execlist[n]); } error_print_context(m, " Active context: ", &ee->context); @@ -548,35 +572,35 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) va_end(args); } -static void print_error_obj(struct drm_i915_error_state_buf *m, - struct intel_engine_cs *engine, - const char *name, - struct drm_i915_error_object *obj) +static void print_error_vma(struct drm_i915_error_state_buf *m, + const struct intel_engine_cs *engine, + const struct i915_vma_coredump *vma) { char out[ASCII85_BUFSZ]; int page; - if (!obj) + if (!vma) return; - if (name) { - err_printf(m, "%s --- %s = 0x%08x %08x\n", - engine ? engine->name : "global", name, - upper_32_bits(obj->gtt_offset), - lower_32_bits(obj->gtt_offset)); - } + err_printf(m, "%s --- %s = 0x%08x %08x\n", + engine ? engine->name : "global", vma->name, + upper_32_bits(vma->gtt_offset), + lower_32_bits(vma->gtt_offset)); + + if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K) + err_printf(m, "gtt_page_sizes = 0x%08x\n", vma->gtt_page_sizes); err_compression_marker(m); - for (page = 0; page < obj->page_count; page++) { + for (page = 0; page < vma->page_count; page++) { int i, len; len = PAGE_SIZE; - if (page == obj->page_count - 1) - len -= obj->unused; + if (page == vma->page_count - 1) + len -= vma->unused; len = ascii85_encode_len(len); for (i = 0; i < len; i++) - err_puts(m, ascii85_encode(obj->pages[page][i], out)); + err_puts(m, ascii85_encode(vma->pages[page][i], out)); } err_puts(m, "\n"); } @@ -588,9 +612,10 @@ static void err_print_capabilities(struct drm_i915_error_state_buf *m, { struct drm_printer p = i915_error_printer(m); - intel_device_info_dump_flags(info, &p); + intel_device_info_print_static(info, &p); + intel_device_info_print_runtime(runtime, &p); + intel_device_info_print_topology(&runtime->sseu, &p); intel_driver_caps_print(caps, &p); - intel_device_info_dump_topology(&runtime->sseu, &p); } static void err_print_params(struct drm_i915_error_state_buf *m, @@ -614,18 +639,13 @@ static void err_print_pciid(struct drm_i915_error_state_buf *m, } static void err_print_uc(struct drm_i915_error_state_buf *m, - const struct i915_error_uc *error_uc) + const struct intel_uc_coredump *error_uc) { struct drm_printer p = i915_error_printer(m); - const struct i915_gpu_state *error = - container_of(error_uc, typeof(*error), uc); - - if (!error->device_info.has_guc) - return; intel_uc_fw_dump(&error_uc->guc_fw, &p); intel_uc_fw_dump(&error_uc->huc_fw, &p); - print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log); + print_error_vma(m, NULL, error_uc->guc_log); } static void err_free_sgl(struct scatterlist *sgl) @@ -645,18 +665,76 @@ static void err_free_sgl(struct scatterlist *sgl) } } +static void err_print_gt(struct drm_i915_error_state_buf *m, + struct intel_gt_coredump *gt) +{ + const struct intel_engine_coredump *ee; + int i; + + err_printf(m, "GT awake: %s\n", yesno(gt->awake)); + err_printf(m, "EIR: 0x%08x\n", gt->eir); + err_printf(m, "IER: 0x%08x\n", gt->ier); + for (i = 0; i < gt->ngtier; i++) + err_printf(m, "GTIER[%d]: 0x%08x\n", i, gt->gtier[i]); + err_printf(m, "PGTBL_ER: 0x%08x\n", gt->pgtbl_er); + err_printf(m, "FORCEWAKE: 0x%08x\n", gt->forcewake); + err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr); + + for (i = 0; i < gt->nfence; i++) + err_printf(m, " fence[%d] = %08llx\n", i, gt->fence[i]); + + if (IS_GEN_RANGE(m->i915, 6, 11)) { + err_printf(m, "ERROR: 0x%08x\n", gt->error); + err_printf(m, "DONE_REG: 0x%08x\n", gt->done_reg); + } + + if (INTEL_GEN(m->i915) >= 8) + err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", + gt->fault_data1, gt->fault_data0); + + if (IS_GEN(m->i915, 7)) + err_printf(m, "ERR_INT: 0x%08x\n", gt->err_int); + + if (IS_GEN_RANGE(m->i915, 8, 11)) + err_printf(m, "GTT_CACHE_EN: 0x%08x\n", gt->gtt_cache); + + if (IS_GEN(m->i915, 12)) + err_printf(m, "AUX_ERR_DBG: 0x%08x\n", gt->aux_err); + + if (INTEL_GEN(m->i915) >= 12) { + int i; + + for (i = 0; i < GEN12_SFC_DONE_MAX; i++) + err_printf(m, " SFC_DONE[%d]: 0x%08x\n", i, + gt->sfc_done[i]); + + err_printf(m, " GAM_DONE: 0x%08x\n", gt->gam_done); + } + + for (ee = gt->engine; ee; ee = ee->next) { + const struct i915_vma_coredump *vma; + + error_print_engine(m, ee); + for (vma = ee->vma; vma; vma = vma->next) + print_error_vma(m, ee->engine, vma); + } + + if (gt->uc) + err_print_uc(m, gt->uc); +} + static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, - struct i915_gpu_state *error) + struct i915_gpu_coredump *error) { - struct drm_i915_error_object *obj; + const struct intel_engine_coredump *ee; struct timespec64 ts; - int i, j; if (*error->error_msg) err_printf(m, "%s\n", error->error_msg); err_printf(m, "Kernel: %s %s\n", init_utsname()->release, init_utsname()->machine); + err_printf(m, "Driver: %s\n", DRIVER_DATE); ts = ktime_to_timespec64(error->time); err_printf(m, "Time: %lld s %ld us\n", (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); @@ -666,21 +744,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, ts = ktime_to_timespec64(error->uptime); err_printf(m, "Uptime: %lld s %ld us\n", (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); - err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); - err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", - error->capture, - jiffies_to_msecs(jiffies - error->capture), - jiffies_to_msecs(error->capture - error->epoch)); - - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (!error->engine[i].context.pid) - continue; + err_printf(m, "Capture: %lu jiffies; %d ms ago\n", + error->capture, jiffies_to_msecs(jiffies - error->capture)); + for (ee = error->gt ? error->gt->engine : NULL; ee; ee = ee->next) err_printf(m, "Active process (on ring %s): %s [%d]\n", - engine_name(m->i915, i), - error->engine[i].context.comm, - error->engine[i].context.pid); - } + ee->engine->name, + ee->context.comm, + ee->context.pid); + err_printf(m, "Reset count: %u\n", error->reset_count); err_printf(m, "Suspend count: %u\n", error->suspend_count); err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); @@ -701,114 +773,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, CSR_VERSION_MINOR(csr->version)); } - err_printf(m, "GT awake: %s\n", yesno(error->awake)); err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock)); err_printf(m, "PM suspended: %s\n", yesno(error->suspended)); - err_printf(m, "EIR: 0x%08x\n", error->eir); - err_printf(m, "IER: 0x%08x\n", error->ier); - for (i = 0; i < error->ngtier; i++) - err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); - err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); - err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); - err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); - err_printf(m, "CCID: 0x%08x\n", error->ccid); - - for (i = 0; i < error->nfence; i++) - err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); - - if (INTEL_GEN(m->i915) >= 6) { - err_printf(m, "ERROR: 0x%08x\n", error->error); - - if (INTEL_GEN(m->i915) >= 8) - err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", - error->fault_data1, error->fault_data0); - - err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg); - } - - if (IS_GEN(m->i915, 7)) - err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); - - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (error->engine[i].engine_id != -1) - error_print_engine(m, &error->engine[i], error->epoch); - } - - for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) { - char buf[128]; - int len, first = 1; - - if (!error->active_vm[i]) - break; - - len = scnprintf(buf, sizeof(buf), "Active ("); - for (j = 0; j < ARRAY_SIZE(error->engine); j++) { - if (error->engine[j].vm != error->active_vm[i]) - continue; - - len += scnprintf(buf + len, sizeof(buf), "%s%s", - first ? "" : ", ", - m->i915->engine[j]->name); - first = 0; - } - scnprintf(buf + len, sizeof(buf), ")"); - print_error_buffers(m, buf, - error->active_bo[i], - error->active_bo_count[i]); - } - - print_error_buffers(m, "Pinned (global)", - error->pinned_bo, - error->pinned_bo_count); - - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - const struct drm_i915_error_engine *ee = &error->engine[i]; - - obj = ee->batchbuffer; - if (obj) { - err_puts(m, m->i915->engine[i]->name); - if (ee->context.pid) - err_printf(m, " (submitted by %s [%d])", - ee->context.comm, - ee->context.pid); - err_printf(m, " --- gtt_offset = 0x%08x %08x\n", - upper_32_bits(obj->gtt_offset), - lower_32_bits(obj->gtt_offset)); - print_error_obj(m, m->i915->engine[i], NULL, obj); - } - - for (j = 0; j < ee->user_bo_count; j++) - print_error_obj(m, m->i915->engine[i], - "user", ee->user_bo[j]); - - if (ee->num_requests) { - err_printf(m, "%s --- %d requests\n", - m->i915->engine[i]->name, - ee->num_requests); - for (j = 0; j < ee->num_requests; j++) - error_print_request(m, " ", - &ee->requests[j], - error->epoch); - } - - print_error_obj(m, m->i915->engine[i], - "ringbuffer", ee->ringbuffer); - - print_error_obj(m, m->i915->engine[i], - "HW Status", ee->hws_page); - print_error_obj(m, m->i915->engine[i], - "HW context", ee->ctx); - - print_error_obj(m, m->i915->engine[i], - "WA context", ee->wa_ctx); - - print_error_obj(m, m->i915->engine[i], - "WA batchbuffer", ee->wa_batchbuffer); - - print_error_obj(m, m->i915->engine[i], - "NULL context", ee->default_state); - } + if (error->gt) + err_print_gt(m, error->gt); if (error->overlay) intel_overlay_print_error_state(m, error->overlay); @@ -819,10 +788,9 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_print_capabilities(m, &error->device_info, &error->runtime_info, &error->driver_caps); err_print_params(m, &error->params); - err_print_uc(m, &error->uc); } -static int err_print_to_sgl(struct i915_gpu_state *error) +static int err_print_to_sgl(struct i915_gpu_coredump *error) { struct drm_i915_error_state_buf m; @@ -859,8 +827,8 @@ static int err_print_to_sgl(struct i915_gpu_state *error) return 0; } -ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error, - char *buf, loff_t off, size_t rem) +ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error, + char *buf, loff_t off, size_t rem) { struct scatterlist *sg; size_t count; @@ -923,249 +891,224 @@ ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error, return count; } -static void i915_error_object_free(struct drm_i915_error_object *obj) +static void i915_vma_coredump_free(struct i915_vma_coredump *vma) { - int page; - - if (obj == NULL) - return; + while (vma) { + struct i915_vma_coredump *next = vma->next; + int page; - for (page = 0; page < obj->page_count; page++) - free_page((unsigned long)obj->pages[page]); + for (page = 0; page < vma->page_count; page++) + free_page((unsigned long)vma->pages[page]); - kfree(obj); + kfree(vma); + vma = next; + } } - -static void cleanup_params(struct i915_gpu_state *error) +static void cleanup_params(struct i915_gpu_coredump *error) { i915_params_free(&error->params); } -static void cleanup_uc_state(struct i915_gpu_state *error) +static void cleanup_uc(struct intel_uc_coredump *uc) { - struct i915_error_uc *error_uc = &error->uc; + kfree(uc->guc_fw.path); + kfree(uc->huc_fw.path); + i915_vma_coredump_free(uc->guc_log); - kfree(error_uc->guc_fw.path); - kfree(error_uc->huc_fw.path); - i915_error_object_free(error_uc->guc_log); + kfree(uc); } -void __i915_gpu_state_free(struct kref *error_ref) +static void cleanup_gt(struct intel_gt_coredump *gt) { - struct i915_gpu_state *error = - container_of(error_ref, typeof(*error), ref); - long i, j; + while (gt->engine) { + struct intel_engine_coredump *ee = gt->engine; - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - struct drm_i915_error_engine *ee = &error->engine[i]; + gt->engine = ee->next; - for (j = 0; j < ee->user_bo_count; j++) - i915_error_object_free(ee->user_bo[j]); - kfree(ee->user_bo); + i915_vma_coredump_free(ee->vma); + kfree(ee); + } - i915_error_object_free(ee->batchbuffer); - i915_error_object_free(ee->wa_batchbuffer); - i915_error_object_free(ee->ringbuffer); - i915_error_object_free(ee->hws_page); - i915_error_object_free(ee->ctx); - i915_error_object_free(ee->wa_ctx); + if (gt->uc) + cleanup_uc(gt->uc); - kfree(ee->requests); - } + kfree(gt); +} - for (i = 0; i < ARRAY_SIZE(error->active_bo); i++) - kfree(error->active_bo[i]); - kfree(error->pinned_bo); +void __i915_gpu_coredump_free(struct kref *error_ref) +{ + struct i915_gpu_coredump *error = + container_of(error_ref, typeof(*error), ref); + + while (error->gt) { + struct intel_gt_coredump *gt = error->gt; + + error->gt = gt->next; + cleanup_gt(gt); + } kfree(error->overlay); kfree(error->display); cleanup_params(error); - cleanup_uc_state(error); err_free_sgl(error->sgl); kfree(error); } -static struct drm_i915_error_object * -i915_error_object_create(struct drm_i915_private *i915, - struct i915_vma *vma) +static struct i915_vma_coredump * +i915_vma_coredump_create(const struct intel_gt *gt, + const struct i915_vma *vma, + const char *name, + struct i915_vma_compress *compress) { - struct i915_ggtt *ggtt = &i915->ggtt; + struct i915_ggtt *ggtt = gt->ggtt; const u64 slot = ggtt->error_capture.start; - struct drm_i915_error_object *dst; - struct compress compress; + struct i915_vma_coredump *dst; unsigned long num_pages; struct sgt_iter iter; - dma_addr_t dma; int ret; - if (!vma || !vma->pages) + might_sleep(); + + if (!vma || !vma->pages || !compress) return NULL; num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT; num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */ - dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), - GFP_ATOMIC | __GFP_NOWARN); + dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL); if (!dst) return NULL; + if (!compress_start(compress)) { + kfree(dst); + return NULL; + } + + strcpy(dst->name, name); + dst->next = NULL; + dst->gtt_offset = vma->node.start; dst->gtt_size = vma->node.size; + dst->gtt_page_sizes = vma->page_sizes.gtt; dst->num_pages = num_pages; dst->page_count = 0; dst->unused = 0; - if (!compress_init(&compress)) { - kfree(dst); - return NULL; - } - ret = -EINVAL; - for_each_sgt_dma(dma, iter, vma->pages) { + if (drm_mm_node_allocated(&ggtt->error_capture)) { void __iomem *s; + dma_addr_t dma; + + for_each_sgt_daddr(dma, iter, vma->pages) { + ggtt->vm.insert_page(&ggtt->vm, dma, slot, + I915_CACHE_NONE, 0); + mb(); + + s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE); + ret = compress_page(compress, + (void __force *)s, dst, + true); + io_mapping_unmap(s); + if (ret) + break; + } + } else if (i915_gem_object_is_lmem(vma->obj)) { + struct intel_memory_region *mem = vma->obj->mm.region; + dma_addr_t dma; + + for_each_sgt_daddr(dma, iter, vma->pages) { + void __iomem *s; + + s = io_mapping_map_wc(&mem->iomap, dma, PAGE_SIZE); + ret = compress_page(compress, + (void __force *)s, dst, + true); + io_mapping_unmap(s); + if (ret) + break; + } + } else { + struct page *page; - ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); + for_each_sgt_page(page, iter, vma->pages) { + void *s; - s = io_mapping_map_atomic_wc(&ggtt->iomap, slot); - ret = compress_page(&compress, (void __force *)s, dst); - io_mapping_unmap_atomic(s); - if (ret) - break; + drm_clflush_pages(&page, 1); + + s = kmap(page); + ret = compress_page(compress, s, dst, false); + kunmap(page); + + drm_clflush_pages(&page, 1); + + if (ret) + break; + } } - if (ret || compress_flush(&compress, dst)) { + if (ret || compress_flush(compress, dst)) { while (dst->page_count--) - free_page((unsigned long)dst->pages[dst->page_count]); + pool_free(&compress->pool, dst->pages[dst->page_count]); kfree(dst); dst = NULL; } + compress_finish(compress); - compress_fini(&compress, dst); return dst; } -static void capture_bo(struct drm_i915_error_buffer *err, - struct i915_vma *vma) -{ - struct drm_i915_gem_object *obj = vma->obj; - - err->size = obj->base.size; - err->name = obj->base.name; - - err->gtt_offset = vma->node.start; - err->read_domains = obj->read_domains; - err->write_domain = obj->write_domain; - err->fence_reg = vma->fence ? vma->fence->id : -1; - err->tiling = i915_gem_object_get_tiling(obj); - err->dirty = obj->mm.dirty; - err->purgeable = obj->mm.madv != I915_MADV_WILLNEED; - err->userptr = obj->userptr.mm != NULL; - err->cache_level = obj->cache_level; -} - -static u32 capture_error_bo(struct drm_i915_error_buffer *err, - int count, struct list_head *head, - unsigned int flags) -#define ACTIVE_ONLY BIT(0) -#define PINNED_ONLY BIT(1) -{ - struct i915_vma *vma; - int i = 0; - - list_for_each_entry(vma, head, vm_link) { - if (!vma->obj) - continue; - - if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma)) - continue; - - if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma)) - continue; - - capture_bo(err++, vma); - if (++i == count) - break; - } - - return i; -} - -/* - * Generate a semi-unique error code. The code is not meant to have meaning, The - * code's only purpose is to try to prevent false duplicated bug reports by - * grossly estimating a GPU error state. - * - * TODO Ideally, hashing the batchbuffer would be a very nice way to determine - * the hang if we could strip the GTT offset information from it. - * - * It's only a small step better than a random number in its current form. - */ -static u32 i915_error_generate_code(struct i915_gpu_state *error, - intel_engine_mask_t engine_mask) +static void gt_record_fences(struct intel_gt_coredump *gt) { - /* - * IPEHR would be an ideal way to detect errors, as it's the gross - * measure of "the command that hung." However, has some very common - * synchronization commands which almost always appear in the case - * strictly a client bug. Use instdone to differentiate those some. - */ - if (engine_mask) { - struct drm_i915_error_engine *ee = - &error->engine[ffs(engine_mask)]; - - return ee->ipehr ^ ee->instdone.instdone; - } - - return 0; -} - -static void gem_record_fences(struct i915_gpu_state *error) -{ - struct drm_i915_private *dev_priv = error->i915; - struct intel_uncore *uncore = &dev_priv->uncore; + struct i915_ggtt *ggtt = gt->_gt->ggtt; + struct intel_uncore *uncore = gt->_gt->uncore; int i; - if (INTEL_GEN(dev_priv) >= 6) { - for (i = 0; i < dev_priv->ggtt.num_fences; i++) - error->fence[i] = + if (INTEL_GEN(uncore->i915) >= 6) { + for (i = 0; i < ggtt->num_fences; i++) + gt->fence[i] = intel_uncore_read64(uncore, FENCE_REG_GEN6_LO(i)); - } else if (INTEL_GEN(dev_priv) >= 4) { - for (i = 0; i < dev_priv->ggtt.num_fences; i++) - error->fence[i] = + } else if (INTEL_GEN(uncore->i915) >= 4) { + for (i = 0; i < ggtt->num_fences; i++) + gt->fence[i] = intel_uncore_read64(uncore, FENCE_REG_965_LO(i)); } else { - for (i = 0; i < dev_priv->ggtt.num_fences; i++) - error->fence[i] = + for (i = 0; i < ggtt->num_fences; i++) + gt->fence[i] = intel_uncore_read(uncore, FENCE_REG(i)); } - error->nfence = i; + gt->nfence = i; } -static void error_record_engine_registers(struct i915_gpu_state *error, - struct intel_engine_cs *engine, - struct drm_i915_error_engine *ee) +static void engine_record_registers(struct intel_engine_coredump *ee) { - struct drm_i915_private *dev_priv = engine->i915; + const struct intel_engine_cs *engine = ee->engine; + struct drm_i915_private *i915 = engine->i915; - if (INTEL_GEN(dev_priv) >= 6) { + if (INTEL_GEN(i915) >= 6) { ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); - if (INTEL_GEN(dev_priv) >= 8) - ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG); + + if (INTEL_GEN(i915) >= 12) + ee->fault_reg = intel_uncore_read(engine->uncore, + GEN12_RING_FAULT_REG); + else if (INTEL_GEN(i915) >= 8) + ee->fault_reg = intel_uncore_read(engine->uncore, + GEN8_RING_FAULT_REG); else ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine); } - if (INTEL_GEN(dev_priv) >= 4) { + if (INTEL_GEN(i915) >= 4) { ee->faddr = ENGINE_READ(engine, RING_DMA_FADD); ee->ipeir = ENGINE_READ(engine, RING_IPEIR); ee->ipehr = ENGINE_READ(engine, RING_IPEHR); ee->instps = ENGINE_READ(engine, RING_INSTPS); ee->bbaddr = ENGINE_READ(engine, RING_BBADDR); - if (INTEL_GEN(dev_priv) >= 8) { + ee->ccid = ENGINE_READ(engine, CCID); + if (INTEL_GEN(i915) >= 8) { ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32; ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32; } @@ -1184,13 +1127,13 @@ static void error_record_engine_registers(struct i915_gpu_state *error, ee->head = ENGINE_READ(engine, RING_HEAD); ee->tail = ENGINE_READ(engine, RING_TAIL); ee->ctl = ENGINE_READ(engine, RING_CTL); - if (INTEL_GEN(dev_priv) > 2) + if (INTEL_GEN(i915) > 2) ee->mode = ENGINE_READ(engine, RING_MI_MODE); - if (!HWS_NEEDS_PHYSICAL(dev_priv)) { + if (!HWS_NEEDS_PHYSICAL(i915)) { i915_reg_t mmio; - if (IS_GEN(dev_priv, 7)) { + if (IS_GEN(i915, 7)) { switch (engine->id) { default: MISSING_CASE(engine->id); @@ -1215,184 +1158,155 @@ static void error_record_engine_registers(struct i915_gpu_state *error, mmio = RING_HWS_PGA(engine->mmio_base); } - ee->hws = I915_READ(mmio); + ee->hws = intel_uncore_read(engine->uncore, mmio); } - ee->idle = intel_engine_is_idle(engine); - if (!ee->idle) - ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; - ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, - engine); + ee->reset_count = i915_reset_engine_count(&i915->gpu_error, engine); - if (HAS_PPGTT(dev_priv)) { + if (HAS_PPGTT(i915)) { int i; ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7); - if (IS_GEN(dev_priv, 6)) { + if (IS_GEN(i915, 6)) { ee->vm_info.pp_dir_base = ENGINE_READ(engine, RING_PP_DIR_BASE_READ); - } else if (IS_GEN(dev_priv, 7)) { + } else if (IS_GEN(i915, 7)) { ee->vm_info.pp_dir_base = ENGINE_READ(engine, RING_PP_DIR_BASE); - } else if (INTEL_GEN(dev_priv) >= 8) { + } else if (INTEL_GEN(i915) >= 8) { u32 base = engine->mmio_base; for (i = 0; i < 4; i++) { ee->vm_info.pdp[i] = - I915_READ(GEN8_RING_PDP_UDW(base, i)); + intel_uncore_read(engine->uncore, + GEN8_RING_PDP_UDW(base, i)); ee->vm_info.pdp[i] <<= 32; ee->vm_info.pdp[i] |= - I915_READ(GEN8_RING_PDP_LDW(base, i)); + intel_uncore_read(engine->uncore, + GEN8_RING_PDP_LDW(base, i)); } } } } -static void record_request(struct i915_request *request, - struct drm_i915_error_request *erq) +static void record_request(const struct i915_request *request, + struct i915_request_coredump *erq) { - struct i915_gem_context *ctx = request->gem_context; + const struct i915_gem_context *ctx; erq->flags = request->fence.flags; erq->context = request->fence.context; erq->seqno = request->fence.seqno; erq->sched_attr = request->sched.attr; - erq->jiffies = request->emitted_jiffies; erq->start = i915_ggtt_offset(request->ring->vma); erq->head = request->head; erq->tail = request->tail; + erq->pid = 0; rcu_read_lock(); - erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0; + ctx = rcu_dereference(request->context->gem_context); + if (ctx) + erq->pid = pid_nr(ctx->pid); rcu_read_unlock(); } -static void engine_record_requests(struct intel_engine_cs *engine, - struct i915_request *first, - struct drm_i915_error_engine *ee) +static void engine_record_execlists(struct intel_engine_coredump *ee) { - struct i915_request *request; - int count; + const struct intel_engine_execlists * const el = &ee->engine->execlists; + struct i915_request * const *port = el->active; + unsigned int n = 0; - count = 0; - request = first; - list_for_each_entry_from(request, &engine->active.requests, sched.link) - count++; - if (!count) - return; - - ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC); - if (!ee->requests) - return; - - ee->num_requests = count; - - count = 0; - request = first; - list_for_each_entry_from(request, - &engine->active.requests, sched.link) { - if (count >= ee->num_requests) { - /* - * If the ring request list was changed in - * between the point where the error request - * list was created and dimensioned and this - * point then just exit early to avoid crashes. - * - * We don't need to communicate that the - * request list changed state during error - * state capture and that the error state is - * slightly incorrect as a consequence since we - * are typically only interested in the request - * list state at the point of error state - * capture, not in any changes happening during - * the capture. - */ - break; - } + while (*port) + record_request(*port++, &ee->execlist[n++]); - record_request(request, &ee->requests[count++]); - } - ee->num_requests = count; + ee->num_ports = n; } -static void error_record_engine_execlists(struct intel_engine_cs *engine, - struct drm_i915_error_engine *ee) +static bool record_context(struct i915_gem_context_coredump *e, + const struct i915_request *rq) { - const struct intel_engine_execlists * const execlists = &engine->execlists; - unsigned int n; - - for (n = 0; n < execlists_num_ports(execlists); n++) { - struct i915_request *rq = port_request(&execlists->port[n]); + struct i915_gem_context *ctx; + struct task_struct *task; + bool capture; - if (!rq) - break; + rcu_read_lock(); + ctx = rcu_dereference(rq->context->gem_context); + if (ctx && !kref_get_unless_zero(&ctx->ref)) + ctx = NULL; + rcu_read_unlock(); + if (!ctx) + return false; - record_request(rq, &ee->execlist[n]); + rcu_read_lock(); + task = pid_task(ctx->pid, PIDTYPE_PID); + if (task) { + strcpy(e->comm, task->comm); + e->pid = task->pid; } + rcu_read_unlock(); - ee->num_ports = n; + e->sched_attr = ctx->sched; + e->guilty = atomic_read(&ctx->guilty_count); + e->active = atomic_read(&ctx->active_count); + + capture = i915_gem_context_no_error_capture(ctx); + + i915_gem_context_put(ctx); + return capture; } -static void record_context(struct drm_i915_error_context *e, - struct i915_gem_context *ctx) +struct intel_engine_capture_vma { + struct intel_engine_capture_vma *next; + struct i915_vma *vma; + char name[16]; +}; + +static struct intel_engine_capture_vma * +capture_vma(struct intel_engine_capture_vma *next, + struct i915_vma *vma, + const char *name, + gfp_t gfp) { - if (ctx->pid) { - struct task_struct *task; + struct intel_engine_capture_vma *c; - rcu_read_lock(); - task = pid_task(ctx->pid, PIDTYPE_PID); - if (task) { - strcpy(e->comm, task->comm); - e->pid = task->pid; - } - rcu_read_unlock(); + if (!vma) + return next; + + c = kmalloc(sizeof(*c), gfp); + if (!c) + return next; + + if (!i915_active_acquire_if_busy(&vma->active)) { + kfree(c); + return next; } - e->hw_id = ctx->hw_id; - e->sched_attr = ctx->sched; - e->guilty = atomic_read(&ctx->guilty_count); - e->active = atomic_read(&ctx->active_count); + strcpy(c->name, name); + c->vma = i915_vma_get(vma); + + c->next = next; + return c; } -static void request_record_user_bo(struct i915_request *request, - struct drm_i915_error_engine *ee) +static struct intel_engine_capture_vma * +capture_user(struct intel_engine_capture_vma *capture, + const struct i915_request *rq, + gfp_t gfp) { struct i915_capture_list *c; - struct drm_i915_error_object **bo; - long count, max; - max = 0; - for (c = request->capture_list; c; c = c->next) - max++; - if (!max) - return; - - bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC); - if (!bo) { - /* If we can't capture everything, try to capture something. */ - max = min_t(long, max, PAGE_SIZE / sizeof(*bo)); - bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC); - } - if (!bo) - return; + for (c = rq->capture_list; c; c = c->next) + capture = capture_vma(capture, c->vma, "user", gfp); - count = 0; - for (c = request->capture_list; c; c = c->next) { - bo[count] = i915_error_object_create(request->i915, c->vma); - if (!bo[count]) - break; - if (++count == max) - break; - } - - ee->user_bo = bo; - ee->user_bo_count = count; + return capture; } -static struct drm_i915_error_object * -capture_object(struct drm_i915_private *dev_priv, - struct drm_i915_gem_object *obj) +static struct i915_vma_coredump * +capture_object(const struct intel_gt *gt, + struct drm_i915_gem_object *obj, + const char *name, + struct i915_vma_compress *compress) { if (obj && i915_gem_object_has_pages(obj)) { struct i915_vma fake = { @@ -1402,190 +1316,221 @@ capture_object(struct drm_i915_private *dev_priv, .obj = obj, }; - return i915_error_object_create(dev_priv, &fake); + return i915_vma_coredump_create(gt, &fake, name, compress); } else { return NULL; } } -static void gem_record_rings(struct i915_gpu_state *error) +static void add_vma(struct intel_engine_coredump *ee, + struct i915_vma_coredump *vma) { - struct drm_i915_private *i915 = error->i915; - struct i915_ggtt *ggtt = &i915->ggtt; - int i; + if (vma) { + vma->next = ee->vma; + ee->vma = vma; + } +} - for (i = 0; i < I915_NUM_ENGINES; i++) { - struct intel_engine_cs *engine = i915->engine[i]; - struct drm_i915_error_engine *ee = &error->engine[i]; - struct i915_request *request; - unsigned long flags; +struct intel_engine_coredump * +intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp) +{ + struct intel_engine_coredump *ee; - ee->engine_id = -1; + ee = kzalloc(sizeof(*ee), gfp); + if (!ee) + return NULL; - if (!engine) - continue; + ee->engine = engine; - ee->engine_id = i; + engine_record_registers(ee); + engine_record_execlists(ee); - error_record_engine_registers(error, engine, ee); - error_record_engine_execlists(engine, ee); + return ee; +} - spin_lock_irqsave(&engine->active.lock, flags); - request = intel_engine_find_active_request(engine); - if (request) { - struct i915_gem_context *ctx = request->gem_context; - struct intel_ring *ring = request->ring; +struct intel_engine_capture_vma * +intel_engine_coredump_add_request(struct intel_engine_coredump *ee, + struct i915_request *rq, + gfp_t gfp) +{ + struct intel_engine_capture_vma *vma = NULL; - ee->vm = ctx->vm ?: &ggtt->vm; + ee->simulated |= record_context(&ee->context, rq); + if (ee->simulated) + return NULL; - record_context(&ee->context, ctx); + /* + * We need to copy these to an anonymous buffer + * as the simplest method to avoid being overwritten + * by userspace. + */ + vma = capture_vma(vma, rq->batch, "batch", gfp); + vma = capture_user(vma, rq, gfp); + vma = capture_vma(vma, rq->ring->vma, "ring", gfp); + vma = capture_vma(vma, rq->context->state, "HW context", gfp); - /* We need to copy these to an anonymous buffer - * as the simplest method to avoid being overwritten - * by userspace. - */ - ee->batchbuffer = - i915_error_object_create(i915, request->batch); + ee->rq_head = rq->head; + ee->rq_post = rq->postfix; + ee->rq_tail = rq->tail; - if (HAS_BROKEN_CS_TLB(i915)) - ee->wa_batchbuffer = - i915_error_object_create(i915, - i915->gt.scratch); - request_record_user_bo(request, ee); + return vma; +} - ee->ctx = - i915_error_object_create(i915, - request->hw_context->state); +void +intel_engine_coredump_add_vma(struct intel_engine_coredump *ee, + struct intel_engine_capture_vma *capture, + struct i915_vma_compress *compress) +{ + const struct intel_engine_cs *engine = ee->engine; - error->simulated |= - i915_gem_context_no_error_capture(ctx); + while (capture) { + struct intel_engine_capture_vma *this = capture; + struct i915_vma *vma = this->vma; - ee->rq_head = request->head; - ee->rq_post = request->postfix; - ee->rq_tail = request->tail; + add_vma(ee, + i915_vma_coredump_create(engine->gt, + vma, this->name, + compress)); - ee->cpu_ring_head = ring->head; - ee->cpu_ring_tail = ring->tail; - ee->ringbuffer = - i915_error_object_create(i915, ring->vma); + i915_active_release(&vma->active); + i915_vma_put(vma); - engine_record_requests(engine, request, ee); - } - spin_unlock_irqrestore(&engine->active.lock, flags); + capture = this->next; + kfree(this); + } - ee->hws_page = - i915_error_object_create(i915, - engine->status_page.vma); + add_vma(ee, + i915_vma_coredump_create(engine->gt, + engine->status_page.vma, + "HW Status", + compress)); - ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma); + add_vma(ee, + i915_vma_coredump_create(engine->gt, + engine->wa_ctx.vma, + "WA context", + compress)); - ee->default_state = capture_object(i915, engine->default_state); - } + add_vma(ee, + capture_object(engine->gt, + engine->default_state, + "NULL context", + compress)); } -static void gem_capture_vm(struct i915_gpu_state *error, - struct i915_address_space *vm, - int idx) +static struct intel_engine_coredump * +capture_engine(struct intel_engine_cs *engine, + struct i915_vma_compress *compress) { - struct drm_i915_error_buffer *active_bo; - struct i915_vma *vma; - int count; + struct intel_engine_capture_vma *capture = NULL; + struct intel_engine_coredump *ee; + struct i915_request *rq; + unsigned long flags; - count = 0; - list_for_each_entry(vma, &vm->bound_list, vm_link) - if (i915_vma_is_active(vma)) - count++; - - active_bo = NULL; - if (count) - active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC); - if (active_bo) - count = capture_error_bo(active_bo, - count, &vm->bound_list, - ACTIVE_ONLY); - else - count = 0; + ee = intel_engine_coredump_alloc(engine, GFP_KERNEL); + if (!ee) + return NULL; - error->active_vm[idx] = vm; - error->active_bo[idx] = active_bo; - error->active_bo_count[idx] = count; + spin_lock_irqsave(&engine->active.lock, flags); + rq = intel_engine_find_active_request(engine); + if (rq) + capture = intel_engine_coredump_add_request(ee, rq, + ATOMIC_MAYFAIL); + spin_unlock_irqrestore(&engine->active.lock, flags); + if (!capture) { + kfree(ee); + return NULL; + } + + intel_engine_coredump_add_vma(ee, capture, compress); + + return ee; } -static void capture_active_buffers(struct i915_gpu_state *error) +static void +gt_record_engines(struct intel_gt_coredump *gt, + struct i915_vma_compress *compress) { - int cnt = 0, i, j; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, gt->_gt, id) { + struct intel_engine_coredump *ee; - BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo)); - BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm)); - BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count)); + /* Refill our page pool before entering atomic section */ + pool_refill(&compress->pool, ALLOW_FAIL); - /* Scan each engine looking for unique active contexts/vm */ - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - struct drm_i915_error_engine *ee = &error->engine[i]; - bool found; + ee = capture_engine(engine, compress); + if (!ee) + continue; - if (!ee->vm) + gt->simulated |= ee->simulated; + if (ee->simulated) { + kfree(ee); continue; + } - found = false; - for (j = 0; j < i && !found; j++) - found = error->engine[j].vm == ee->vm; - if (!found) - gem_capture_vm(error, ee->vm, cnt++); + ee->next = gt->engine; + gt->engine = ee; } } -static void capture_pinned_buffers(struct i915_gpu_state *error) +static struct intel_uc_coredump * +gt_record_uc(struct intel_gt_coredump *gt, + struct i915_vma_compress *compress) { - struct i915_address_space *vm = &error->i915->ggtt.vm; - struct drm_i915_error_buffer *bo; - struct i915_vma *vma; - int count; + const struct intel_uc *uc = >->_gt->uc; + struct intel_uc_coredump *error_uc; - count = 0; - list_for_each_entry(vma, &vm->bound_list, vm_link) - count++; + error_uc = kzalloc(sizeof(*error_uc), ALLOW_FAIL); + if (!error_uc) + return NULL; - bo = NULL; - if (count) - bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC); - if (!bo) - return; + memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw)); + memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw)); - error->pinned_bo_count = - capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY); - error->pinned_bo = bo; + /* Non-default firmware paths will be specified by the modparam. + * As modparams are generally accesible from the userspace make + * explicit copies of the firmware paths. + */ + error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL); + error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL); + error_uc->guc_log = + i915_vma_coredump_create(gt->_gt, + uc->guc.log.vma, "GuC log buffer", + compress); + + return error_uc; } -static void capture_uc_state(struct i915_gpu_state *error) +static void gt_capture_prepare(struct intel_gt_coredump *gt) { - struct drm_i915_private *i915 = error->i915; - struct i915_error_uc *error_uc = &error->uc; + struct i915_ggtt *ggtt = gt->_gt->ggtt; - /* Capturing uC state won't be useful if there is no GuC */ - if (!error->device_info.has_guc) - return; + mutex_lock(&ggtt->error_mutex); +} - error_uc->guc_fw = i915->guc.fw; - error_uc->huc_fw = i915->huc.fw; +static void gt_capture_finish(struct intel_gt_coredump *gt) +{ + struct i915_ggtt *ggtt = gt->_gt->ggtt; - /* Non-default firmware paths will be specified by the modparam. - * As modparams are generally accesible from the userspace make - * explicit copies of the firmware paths. - */ - error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC); - error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC); - error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma); + if (drm_mm_node_allocated(&ggtt->error_capture)) + ggtt->vm.clear_range(&ggtt->vm, + ggtt->error_capture.start, + PAGE_SIZE); + + mutex_unlock(&ggtt->error_mutex); } /* Capture all registers which don't fit into another category. */ -static void capture_reg_state(struct i915_gpu_state *error) +static void gt_record_regs(struct intel_gt_coredump *gt) { - struct drm_i915_private *i915 = error->i915; - struct intel_uncore *uncore = &i915->uncore; + struct intel_uncore *uncore = gt->_gt->uncore; + struct drm_i915_private *i915 = uncore->i915; int i; - /* General organization + /* + * General organization * 1. Registers specific to a single generation * 2. Registers which belong to multiple generations * 3. Feature specific registers. @@ -1595,122 +1540,162 @@ static void capture_reg_state(struct i915_gpu_state *error) /* 1: Registers specific to a single generation */ if (IS_VALLEYVIEW(i915)) { - error->gtier[0] = intel_uncore_read(uncore, GTIER); - error->ier = intel_uncore_read(uncore, VLV_IER); - error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV); + gt->gtier[0] = intel_uncore_read(uncore, GTIER); + gt->ier = intel_uncore_read(uncore, VLV_IER); + gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV); } if (IS_GEN(i915, 7)) - error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT); + gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT); - if (INTEL_GEN(i915) >= 8) { - error->fault_data0 = intel_uncore_read(uncore, - GEN8_FAULT_TLB_DATA0); - error->fault_data1 = intel_uncore_read(uncore, - GEN8_FAULT_TLB_DATA1); + if (INTEL_GEN(i915) >= 12) { + gt->fault_data0 = intel_uncore_read(uncore, + GEN12_FAULT_TLB_DATA0); + gt->fault_data1 = intel_uncore_read(uncore, + GEN12_FAULT_TLB_DATA1); + } else if (INTEL_GEN(i915) >= 8) { + gt->fault_data0 = intel_uncore_read(uncore, + GEN8_FAULT_TLB_DATA0); + gt->fault_data1 = intel_uncore_read(uncore, + GEN8_FAULT_TLB_DATA1); } if (IS_GEN(i915, 6)) { - error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE); - error->gab_ctl = intel_uncore_read(uncore, GAB_CTL); - error->gfx_mode = intel_uncore_read(uncore, GFX_MODE); + gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE); + gt->gab_ctl = intel_uncore_read(uncore, GAB_CTL); + gt->gfx_mode = intel_uncore_read(uncore, GFX_MODE); } /* 2: Registers which belong to multiple generations */ if (INTEL_GEN(i915) >= 7) - error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT); + gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT); if (INTEL_GEN(i915) >= 6) { - error->derrmr = intel_uncore_read(uncore, DERRMR); - error->error = intel_uncore_read(uncore, ERROR_GEN6); - error->done_reg = intel_uncore_read(uncore, DONE_REG); + gt->derrmr = intel_uncore_read(uncore, DERRMR); + if (INTEL_GEN(i915) < 12) { + gt->error = intel_uncore_read(uncore, ERROR_GEN6); + gt->done_reg = intel_uncore_read(uncore, DONE_REG); + } } - if (INTEL_GEN(i915) >= 5) - error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE)); - /* 3: Feature specific registers */ if (IS_GEN_RANGE(i915, 6, 7)) { - error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK); - error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS); + gt->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK); + gt->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS); + } + + if (IS_GEN_RANGE(i915, 8, 11)) + gt->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN); + + if (IS_GEN(i915, 12)) + gt->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG); + + if (INTEL_GEN(i915) >= 12) { + for (i = 0; i < GEN12_SFC_DONE_MAX; i++) { + gt->sfc_done[i] = + intel_uncore_read(uncore, GEN12_SFC_DONE(i)); + } + + gt->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE); } /* 4: Everything else */ if (INTEL_GEN(i915) >= 11) { - error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); - error->gtier[0] = + gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); + gt->gtier[0] = intel_uncore_read(uncore, GEN11_RENDER_COPY_INTR_ENABLE); - error->gtier[1] = + gt->gtier[1] = intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE); - error->gtier[2] = + gt->gtier[2] = intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE); - error->gtier[3] = + gt->gtier[3] = intel_uncore_read(uncore, GEN11_GPM_WGBOXPERF_INTR_ENABLE); - error->gtier[4] = + gt->gtier[4] = intel_uncore_read(uncore, GEN11_CRYPTO_RSVD_INTR_ENABLE); - error->gtier[5] = + gt->gtier[5] = intel_uncore_read(uncore, GEN11_GUNIT_CSME_INTR_ENABLE); - error->ngtier = 6; + gt->ngtier = 6; } else if (INTEL_GEN(i915) >= 8) { - error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); + gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); for (i = 0; i < 4; i++) - error->gtier[i] = intel_uncore_read(uncore, - GEN8_GT_IER(i)); - error->ngtier = 4; + gt->gtier[i] = + intel_uncore_read(uncore, GEN8_GT_IER(i)); + gt->ngtier = 4; } else if (HAS_PCH_SPLIT(i915)) { - error->ier = intel_uncore_read(uncore, DEIER); - error->gtier[0] = intel_uncore_read(uncore, GTIER); - error->ngtier = 1; + gt->ier = intel_uncore_read(uncore, DEIER); + gt->gtier[0] = intel_uncore_read(uncore, GTIER); + gt->ngtier = 1; } else if (IS_GEN(i915, 2)) { - error->ier = intel_uncore_read16(uncore, GEN2_IER); + gt->ier = intel_uncore_read16(uncore, GEN2_IER); } else if (!IS_VALLEYVIEW(i915)) { - error->ier = intel_uncore_read(uncore, GEN2_IER); + gt->ier = intel_uncore_read(uncore, GEN2_IER); } - error->eir = intel_uncore_read(uncore, EIR); - error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER); + gt->eir = intel_uncore_read(uncore, EIR); + gt->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER); +} + +/* + * Generate a semi-unique error code. The code is not meant to have meaning, The + * code's only purpose is to try to prevent false duplicated bug reports by + * grossly estimating a GPU error state. + * + * TODO Ideally, hashing the batchbuffer would be a very nice way to determine + * the hang if we could strip the GTT offset information from it. + * + * It's only a small step better than a random number in its current form. + */ +static u32 generate_ecode(const struct intel_engine_coredump *ee) +{ + /* + * IPEHR would be an ideal way to detect errors, as it's the gross + * measure of "the command that hung." However, has some very common + * synchronization commands which almost always appear in the case + * strictly a client bug. Use instdone to differentiate those some. + */ + return ee ? ee->ipehr ^ ee->instdone.instdone : 0; } -static const char * -error_msg(struct i915_gpu_state *error, - intel_engine_mask_t engines, const char *msg) +static const char *error_msg(struct i915_gpu_coredump *error) { + struct intel_engine_coredump *first = NULL; + struct intel_gt_coredump *gt; + intel_engine_mask_t engines; int len; - int i; - for (i = 0; i < ARRAY_SIZE(error->engine); i++) - if (!error->engine[i].context.pid) - engines &= ~BIT(i); + engines = 0; + for (gt = error->gt; gt; gt = gt->next) { + struct intel_engine_coredump *cs; + + if (gt->engine && !first) + first = gt->engine; + + for (cs = gt->engine; cs; cs = cs->next) + engines |= cs->engine->mask; + } len = scnprintf(error->error_msg, sizeof(error->error_msg), - "GPU HANG: ecode %d:%x:0x%08x", + "GPU HANG: ecode %d:%x:%08x", INTEL_GEN(error->i915), engines, - i915_error_generate_code(error, engines)); - if (engines) { + generate_ecode(first)); + if (first && first->context.pid) { /* Just show the first executing process, more is confusing */ - i = __ffs(engines); len += scnprintf(error->error_msg + len, sizeof(error->error_msg) - len, ", in %s [%d]", - error->engine[i].context.comm, - error->engine[i].context.pid); + first->context.comm, first->context.pid); } - if (msg) - len += scnprintf(error->error_msg + len, - sizeof(error->error_msg) - len, - ", %s", msg); return error->error_msg; } -static void capture_gen_state(struct i915_gpu_state *error) +static void capture_gen(struct i915_gpu_coredump *error) { struct drm_i915_private *i915 = error->i915; - error->awake = i915->gt.awake; error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count); error->suspended = i915->runtime_pm.suspended; @@ -1721,6 +1706,7 @@ static void capture_gen_state(struct i915_gpu_state *error) error->reset_count = i915_reset_count(&i915->gpu_error); error->suspend_count = i915->suspend_count; + i915_params_copy(&error->params, &i915_modparams); memcpy(&error->device_info, INTEL_INFO(i915), sizeof(error->device_info)); @@ -1730,155 +1716,182 @@ static void capture_gen_state(struct i915_gpu_state *error) error->driver_caps = i915->caps; } -static void capture_params(struct i915_gpu_state *error) +struct i915_gpu_coredump * +i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp) { - i915_params_copy(&error->params, &i915_modparams); -} + struct i915_gpu_coredump *error; -static unsigned long capture_find_epoch(const struct i915_gpu_state *error) -{ - unsigned long epoch = error->capture; - int i; + if (!i915_modparams.error_capture) + return NULL; - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - const struct drm_i915_error_engine *ee = &error->engine[i]; + error = kzalloc(sizeof(*error), gfp); + if (!error) + return NULL; - if (ee->hangcheck_timestamp && - time_before(ee->hangcheck_timestamp, epoch)) - epoch = ee->hangcheck_timestamp; - } + kref_init(&error->ref); + error->i915 = i915; + + error->time = ktime_get_real(); + error->boottime = ktime_get_boottime(); + error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time); + error->capture = jiffies; + + capture_gen(error); - return epoch; + return error; } -static void capture_finish(struct i915_gpu_state *error) +#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) + +struct intel_gt_coredump * +intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp) { - struct i915_ggtt *ggtt = &error->i915->ggtt; - const u64 slot = ggtt->error_capture.start; + struct intel_gt_coredump *gc; + + gc = kzalloc(sizeof(*gc), gfp); + if (!gc) + return NULL; - ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); + gc->_gt = gt; + gc->awake = intel_gt_pm_is_awake(gt); + + gt_record_regs(gc); + gt_record_fences(gc); + + return gc; } -static int capture(void *data) +struct i915_vma_compress * +i915_vma_capture_prepare(struct intel_gt_coredump *gt) { - struct i915_gpu_state *error = data; - - error->time = ktime_get_real(); - error->boottime = ktime_get_boottime(); - error->uptime = ktime_sub(ktime_get(), - error->i915->gt.last_init_time); - error->capture = jiffies; + struct i915_vma_compress *compress; - capture_params(error); - capture_gen_state(error); - capture_uc_state(error); - capture_reg_state(error); - gem_record_fences(error); - gem_record_rings(error); - capture_active_buffers(error); - capture_pinned_buffers(error); + compress = kmalloc(sizeof(*compress), ALLOW_FAIL); + if (!compress) + return NULL; - error->overlay = intel_overlay_capture_error_state(error->i915); - error->display = intel_display_capture_error_state(error->i915); + if (!compress_init(compress)) { + kfree(compress); + return NULL; + } - error->epoch = capture_find_epoch(error); + gt_capture_prepare(gt); - capture_finish(error); - return 0; + return compress; } -#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) +void i915_vma_capture_finish(struct intel_gt_coredump *gt, + struct i915_vma_compress *compress) +{ + if (!compress) + return; + + gt_capture_finish(gt); + + compress_fini(compress); + kfree(compress); +} -struct i915_gpu_state * -i915_capture_gpu_state(struct drm_i915_private *i915) +struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915) { - struct i915_gpu_state *error; + struct i915_gpu_coredump *error; /* Check if GPU capture has been disabled */ error = READ_ONCE(i915->gpu_error.first_error); if (IS_ERR(error)) return error; - error = kzalloc(sizeof(*error), GFP_ATOMIC); - if (!error) { - i915_disable_error_state(i915, -ENOMEM); + error = i915_gpu_coredump_alloc(i915, ALLOW_FAIL); + if (!error) return ERR_PTR(-ENOMEM); - } - kref_init(&error->ref); - error->i915 = i915; + error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL); + if (error->gt) { + struct i915_vma_compress *compress; + + compress = i915_vma_capture_prepare(error->gt); + if (!compress) { + kfree(error->gt); + kfree(error); + return ERR_PTR(-ENOMEM); + } + + gt_record_engines(error->gt, compress); + + if (INTEL_INFO(i915)->has_gt_uc) + error->gt->uc = gt_record_uc(error->gt, compress); - stop_machine(capture, error, NULL); + i915_vma_capture_finish(error->gt, compress); + + error->simulated |= error->gt->simulated; + } + + error->overlay = intel_overlay_capture_error_state(i915); + error->display = intel_display_capture_error_state(i915); return error; } -/** - * i915_capture_error_state - capture an error record for later analysis - * @i915: i915 device - * @engine_mask: the mask of engines triggering the hang - * @msg: a message to insert into the error capture header - * - * Should be called when an error is detected (either a hang or an error - * interrupt) to capture error state from the time of the error. Fills - * out a structure which becomes available in debugfs for user level tools - * to pick up. - */ -void i915_capture_error_state(struct drm_i915_private *i915, - intel_engine_mask_t engine_mask, - const char *msg) +void i915_error_state_store(struct i915_gpu_coredump *error) { + struct drm_i915_private *i915; static bool warned; - struct i915_gpu_state *error; - unsigned long flags; - if (!i915_modparams.error_capture) + if (IS_ERR_OR_NULL(error)) return; - if (READ_ONCE(i915->gpu_error.first_error)) - return; + i915 = error->i915; + dev_info(i915->drm.dev, "%s\n", error_msg(error)); - error = i915_capture_gpu_state(i915); - if (IS_ERR(error)) + if (error->simulated || + cmpxchg(&i915->gpu_error.first_error, NULL, error)) return; - dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg)); + i915_gpu_coredump_get(error); - if (!error->simulated) { - spin_lock_irqsave(&i915->gpu_error.lock, flags); - if (!i915->gpu_error.first_error) { - i915->gpu_error.first_error = error; - error = NULL; - } - spin_unlock_irqrestore(&i915->gpu_error.lock, flags); + if (!xchg(&warned, true) && + ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { + pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); + pr_info("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); + pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); + pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n"); + pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n", + i915->drm.primary->index); } +} - if (error) { - __i915_gpu_state_free(&error->ref); +/** + * i915_capture_error_state - capture an error record for later analysis + * @i915: i915 device + * + * Should be called when an error is detected (either a hang or an error + * interrupt) to capture error state from the time of the error. Fills + * out a structure which becomes available in debugfs for user level tools + * to pick up. + */ +void i915_capture_error_state(struct drm_i915_private *i915) +{ + struct i915_gpu_coredump *error; + + error = i915_gpu_coredump(i915); + if (IS_ERR(error)) { + cmpxchg(&i915->gpu_error.first_error, NULL, error); return; } - if (!warned && - ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { - DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); - DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); - DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); - DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n"); - DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", - i915->drm.primary->index); - warned = true; - } + i915_error_state_store(error); + i915_gpu_coredump_put(error); } -struct i915_gpu_state * +struct i915_gpu_coredump * i915_first_error_state(struct drm_i915_private *i915) { - struct i915_gpu_state *error; + struct i915_gpu_coredump *error; spin_lock_irq(&i915->gpu_error.lock); error = i915->gpu_error.first_error; if (!IS_ERR_OR_NULL(error)) - i915_gpu_state_get(error); + i915_gpu_coredump_get(error); spin_unlock_irq(&i915->gpu_error.lock); return error; @@ -1886,7 +1899,7 @@ i915_first_error_state(struct drm_i915_private *i915) void i915_reset_error_state(struct drm_i915_private *i915) { - struct i915_gpu_state *error; + struct i915_gpu_coredump *error; spin_lock_irq(&i915->gpu_error.lock); error = i915->gpu_error.first_error; @@ -1895,7 +1908,7 @@ void i915_reset_error_state(struct drm_i915_private *i915) spin_unlock_irq(&i915->gpu_error.lock); if (!IS_ERR_OR_NULL(error)) - i915_gpu_state_put(error); + i915_gpu_coredump_put(error); } void i915_disable_error_state(struct drm_i915_private *i915, int err) |